| File: | build/source/llvm/lib/Target/X86/X86ISelLowering.cpp |
| Warning: | line 45517, column 39: The result of the left shift is undefined due to shifting by '4294967291', which is greater or equal to the width of type 'int' |
| 1 | //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// | |||
| 2 | // | |||
| 3 | // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. | |||
| 4 | // See https://llvm.org/LICENSE.txt for license information. | |||
| 5 | // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | |||
| 6 | // | |||
| 7 | //===----------------------------------------------------------------------===// | |||
| 8 | // | |||
| 9 | // This file defines the interfaces that X86 uses to lower LLVM code into a | |||
| 10 | // selection DAG. | |||
| 11 | // | |||
| 12 | //===----------------------------------------------------------------------===// | |||
| 13 | ||||
| 14 | #include "X86ISelLowering.h" | |||
| 15 | #include "MCTargetDesc/X86ShuffleDecode.h" | |||
| 16 | #include "X86.h" | |||
| 17 | #include "X86CallingConv.h" | |||
| 18 | #include "X86FrameLowering.h" | |||
| 19 | #include "X86InstrBuilder.h" | |||
| 20 | #include "X86IntrinsicsInfo.h" | |||
| 21 | #include "X86MachineFunctionInfo.h" | |||
| 22 | #include "X86TargetMachine.h" | |||
| 23 | #include "X86TargetObjectFile.h" | |||
| 24 | #include "llvm/ADT/SmallBitVector.h" | |||
| 25 | #include "llvm/ADT/SmallSet.h" | |||
| 26 | #include "llvm/ADT/Statistic.h" | |||
| 27 | #include "llvm/ADT/StringExtras.h" | |||
| 28 | #include "llvm/ADT/StringSwitch.h" | |||
| 29 | #include "llvm/Analysis/BlockFrequencyInfo.h" | |||
| 30 | #include "llvm/Analysis/ObjCARCUtil.h" | |||
| 31 | #include "llvm/Analysis/ProfileSummaryInfo.h" | |||
| 32 | #include "llvm/Analysis/VectorUtils.h" | |||
| 33 | #include "llvm/CodeGen/IntrinsicLowering.h" | |||
| 34 | #include "llvm/CodeGen/MachineFrameInfo.h" | |||
| 35 | #include "llvm/CodeGen/MachineFunction.h" | |||
| 36 | #include "llvm/CodeGen/MachineInstrBuilder.h" | |||
| 37 | #include "llvm/CodeGen/MachineJumpTableInfo.h" | |||
| 38 | #include "llvm/CodeGen/MachineLoopInfo.h" | |||
| 39 | #include "llvm/CodeGen/MachineModuleInfo.h" | |||
| 40 | #include "llvm/CodeGen/MachineRegisterInfo.h" | |||
| 41 | #include "llvm/CodeGen/TargetLowering.h" | |||
| 42 | #include "llvm/CodeGen/WinEHFuncInfo.h" | |||
| 43 | #include "llvm/IR/CallingConv.h" | |||
| 44 | #include "llvm/IR/Constants.h" | |||
| 45 | #include "llvm/IR/DerivedTypes.h" | |||
| 46 | #include "llvm/IR/DiagnosticInfo.h" | |||
| 47 | #include "llvm/IR/EHPersonalities.h" | |||
| 48 | #include "llvm/IR/Function.h" | |||
| 49 | #include "llvm/IR/GlobalAlias.h" | |||
| 50 | #include "llvm/IR/GlobalVariable.h" | |||
| 51 | #include "llvm/IR/IRBuilder.h" | |||
| 52 | #include "llvm/IR/Instructions.h" | |||
| 53 | #include "llvm/IR/Intrinsics.h" | |||
| 54 | #include "llvm/IR/PatternMatch.h" | |||
| 55 | #include "llvm/MC/MCAsmInfo.h" | |||
| 56 | #include "llvm/MC/MCContext.h" | |||
| 57 | #include "llvm/MC/MCExpr.h" | |||
| 58 | #include "llvm/MC/MCSymbol.h" | |||
| 59 | #include "llvm/Support/CommandLine.h" | |||
| 60 | #include "llvm/Support/Debug.h" | |||
| 61 | #include "llvm/Support/ErrorHandling.h" | |||
| 62 | #include "llvm/Support/KnownBits.h" | |||
| 63 | #include "llvm/Support/MathExtras.h" | |||
| 64 | #include "llvm/Target/TargetOptions.h" | |||
| 65 | #include <algorithm> | |||
| 66 | #include <bitset> | |||
| 67 | #include <cctype> | |||
| 68 | #include <numeric> | |||
| 69 | using namespace llvm; | |||
| 70 | ||||
| 71 | #define DEBUG_TYPE"x86-isel" "x86-isel" | |||
| 72 | ||||
| 73 | STATISTIC(NumTailCalls, "Number of tail calls"); | |||
| 74 | ||||
| 75 | static cl::opt<int> ExperimentalPrefInnermostLoopAlignment( | |||
| 76 | "x86-experimental-pref-innermost-loop-alignment", cl::init(4), | |||
| 77 | cl::desc( | |||
| 78 | "Sets the preferable loop alignment for experiments (as log2 bytes) " | |||
| 79 | "for innermost loops only. If specified, this option overrides " | |||
| 80 | "alignment set by x86-experimental-pref-loop-alignment."), | |||
| 81 | cl::Hidden); | |||
| 82 | ||||
| 83 | static cl::opt<bool> MulConstantOptimization( | |||
| 84 | "mul-constant-optimization", cl::init(true), | |||
| 85 | cl::desc("Replace 'mul x, Const' with more effective instructions like " | |||
| 86 | "SHIFT, LEA, etc."), | |||
| 87 | cl::Hidden); | |||
| 88 | ||||
| 89 | static cl::opt<bool> ExperimentalUnorderedISEL( | |||
| 90 | "x86-experimental-unordered-atomic-isel", cl::init(false), | |||
| 91 | cl::desc("Use LoadSDNode and StoreSDNode instead of " | |||
| 92 | "AtomicSDNode for unordered atomic loads and " | |||
| 93 | "stores respectively."), | |||
| 94 | cl::Hidden); | |||
| 95 | ||||
| 96 | /// Call this when the user attempts to do something unsupported, like | |||
| 97 | /// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike | |||
| 98 | /// report_fatal_error, so calling code should attempt to recover without | |||
| 99 | /// crashing. | |||
| 100 | static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl, | |||
| 101 | const char *Msg) { | |||
| 102 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 103 | DAG.getContext()->diagnose( | |||
| 104 | DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc())); | |||
| 105 | } | |||
| 106 | ||||
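A note on the recovery contract: errorUnsupported only raises a diagnostic, so the caller must still hand back some DAG node for compilation to continue. A minimal sketch of the pattern (the lowering routine and message below are hypothetical):

    static SDValue lowerSomething(SDValue Op, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
      SDLoc dl(Op);
      if (!Subtarget.hasSSE2()) {
        errorUnsupported(DAG, dl, "hypothetical: operation needs SSE2");
        // Recover with an UNDEF of the right type instead of crashing.
        return DAG.getUNDEF(Op.getValueType());
      }
      return Op; // normal lowering elided
    }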
| 107 | /// Returns true if a CC can dynamically exclude a register from the list of | |||
| 108 | /// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on | |||
| 109 | /// the return registers. | |||
| 110 | static bool shouldDisableRetRegFromCSR(CallingConv::ID CC) { | |||
| 111 | switch (CC) { | |||
| 112 | default: | |||
| 113 | return false; | |||
| 114 | case CallingConv::X86_RegCall: | |||
| 115 | case CallingConv::PreserveMost: | |||
| 116 | case CallingConv::PreserveAll: | |||
| 117 | return true; | |||
| 118 | } | |||
| 119 | } | |||
| 120 | ||||
| 121 | /// Returns true if a CC can dynamically exclude a register from the list of | |||
| 122 | /// callee-saved-registers (TargetRegistryInfo::getCalleeSavedRegs()) based on | |||
| 123 | /// the parameters. | |||
| 124 | static bool shouldDisableArgRegFromCSR(CallingConv::ID CC) { | |||
| 125 | return CC == CallingConv::X86_RegCall; | |||
| 126 | } | |||
| 127 | ||||
| 128 | X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, | |||
| 129 | const X86Subtarget &STI) | |||
| 130 | : TargetLowering(TM), Subtarget(STI) { | |||
| 131 | bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87(); | |||
| 132 | MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0)); | |||
| 133 | ||||
| 134 | // Set up the TargetLowering object. | |||
| 135 | ||||
| 136 | // X86 is weird. It always uses i8 for shift amounts and setcc results. | |||
| 137 | setBooleanContents(ZeroOrOneBooleanContent); | |||
| 138 | // X86-SSE is even stranger. It uses -1 or 0 for vector masks. | |||
| 139 | setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); | |||
| 140 | ||||
| 141 | // For 64-bit, since we have so many registers, use the ILP scheduler. | |||
| 142 | // For 32-bit, use the register pressure specific scheduling. | |||
| 143 | // For Atom, always use ILP scheduling. | |||
| 144 | if (Subtarget.isAtom()) | |||
| 145 | setSchedulingPreference(Sched::ILP); | |||
| 146 | else if (Subtarget.is64Bit()) | |||
| 147 | setSchedulingPreference(Sched::ILP); | |||
| 148 | else | |||
| 149 | setSchedulingPreference(Sched::RegPressure); | |||
| 150 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); | |||
| 151 | setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); | |||
| 152 | ||||
| 153 | // Bypass expensive divides and use cheaper ones. | |||
| 154 | if (TM.getOptLevel() >= CodeGenOpt::Default) { | |||
| 155 | if (Subtarget.hasSlowDivide32()) | |||
| 156 | addBypassSlowDiv(32, 8); | |||
| 157 | if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit()) | |||
| 158 | addBypassSlowDiv(64, 32); | |||
| 159 | } | |||
| 160 | ||||
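The bypass is a runtime width test: if both operands happen to fit in the narrower type, the cheap divide is used. Conceptually, for addBypassSlowDiv(32, 8) (a sketch of the transformation's shape, not the pass's code):

    unsigned div32(unsigned A, unsigned B) {
      if (((A | B) & 0xFFFFFF00u) == 0)             // both fit in 8 bits?
        return (unsigned char)A / (unsigned char)B; // cheap 8-bit DIV
      return A / B;                                 // full-width DIV
    }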
| 161 | // Setup Windows compiler runtime calls. | |||
| 162 | if (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()) { | |||
| 163 | static const struct { | |||
| 164 | const RTLIB::Libcall Op; | |||
| 165 | const char * const Name; | |||
| 166 | const CallingConv::ID CC; | |||
| 167 | } LibraryCalls[] = { | |||
| 168 | { RTLIB::SDIV_I64, "_alldiv", CallingConv::X86_StdCall }, | |||
| 169 | { RTLIB::UDIV_I64, "_aulldiv", CallingConv::X86_StdCall }, | |||
| 170 | { RTLIB::SREM_I64, "_allrem", CallingConv::X86_StdCall }, | |||
| 171 | { RTLIB::UREM_I64, "_aullrem", CallingConv::X86_StdCall }, | |||
| 172 | { RTLIB::MUL_I64, "_allmul", CallingConv::X86_StdCall }, | |||
| 173 | }; | |||
| 174 | ||||
| 175 | for (const auto &LC : LibraryCalls) { | |||
| 176 | setLibcallName(LC.Op, LC.Name); | |||
| 177 | setLibcallCallingConv(LC.Op, LC.CC); | |||
| 178 | } | |||
| 179 | } | |||
| 180 | ||||
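This table is why plain 64-bit integer division on 32-bit Windows compiles to a runtime call rather than inline code; for example:

    // On i686-windows-msvc the divide below lowers to a call to _alldiv
    // (stdcall), per the RTLIB::SDIV_I64 entry above.
    long long sdiv64(long long A, long long B) { return A / B; }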
| 181 | if (Subtarget.getTargetTriple().isOSMSVCRT()) { | |||
| 182 | // MSVCRT doesn't have powi; fall back to pow | |||
| 183 | setLibcallName(RTLIB::POWI_F32, nullptr); | |||
| 184 | setLibcallName(RTLIB::POWI_F64, nullptr); | |||
| 185 | } | |||
| 186 | ||||
| 187 | // If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to | |||
| 188 | // 32 bits so the AtomicExpandPass will expand it so we don't need cmpxchg8b. | |||
| 189 | // FIXME: Should we be limiting the atomic size on other configs? Default is | |||
| 190 | // 1024. | |||
| 191 | if (!Subtarget.canUseCMPXCHG8B()) | |||
| 192 | setMaxAtomicSizeInBitsSupported(32); | |||
| 193 | ||||
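The effect of the cap: AtomicExpandPass rewrites any atomic wider than the supported size into a library call instead of asking the backend for an inline sequence it cannot emit. A sketch, assuming the usual libatomic helper naming:

    #include <atomic>
    long long load64(std::atomic<long long> &A) {
      // Without CMPXCHG8B this cannot be done inline, so with the 32-bit cap
      // it becomes a call such as __atomic_load_8.
      return A.load();
    }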
| 194 | setMaxDivRemBitWidthSupported(Subtarget.is64Bit() ? 128 : 64); | |||
| 195 | ||||
| 196 | setMaxLargeFPConvertBitWidthSupported(128); | |||
| 197 | ||||
| 198 | // Set up the register classes. | |||
| 199 | addRegisterClass(MVT::i8, &X86::GR8RegClass); | |||
| 200 | addRegisterClass(MVT::i16, &X86::GR16RegClass); | |||
| 201 | addRegisterClass(MVT::i32, &X86::GR32RegClass); | |||
| 202 | if (Subtarget.is64Bit()) | |||
| 203 | addRegisterClass(MVT::i64, &X86::GR64RegClass); | |||
| 204 | ||||
| 205 | for (MVT VT : MVT::integer_valuetypes()) | |||
| 206 | setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); | |||
| 207 | ||||
| 208 | // We don't accept any truncstore of integer registers. | |||
| 209 | setTruncStoreAction(MVT::i64, MVT::i32, Expand); | |||
| 210 | setTruncStoreAction(MVT::i64, MVT::i16, Expand); | |||
| 211 | setTruncStoreAction(MVT::i64, MVT::i8 , Expand); | |||
| 212 | setTruncStoreAction(MVT::i32, MVT::i16, Expand); | |||
| 213 | setTruncStoreAction(MVT::i32, MVT::i8 , Expand); | |||
| 214 | setTruncStoreAction(MVT::i16, MVT::i8, Expand); | |||
| 215 | ||||
| 216 | setTruncStoreAction(MVT::f64, MVT::f32, Expand); | |||
| 217 | ||||
| 218 | // SETOEQ and SETUNE require checking two conditions. | |||
| 219 | for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) { | |||
| 220 | setCondCodeAction(ISD::SETOEQ, VT, Expand); | |||
| 221 | setCondCodeAction(ISD::SETUNE, VT, Expand); | |||
| 222 | } | |||
| 223 | ||||
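Marking these Expand lets legalization split each into two legal predicates: OEQ is "ordered and equal", UNE is "unordered or not-equal". In C++ terms:

    #include <cmath>
    // SETOEQ(a,b) == SETO(a,b) && SETEQ(a,b);  SETUNE(a,b) == SETUO(a,b) || SETNE(a,b)
    bool setoeq(double A, double B) { return !std::isunordered(A, B) && A == B; }
    bool setune(double A, double B) { return  std::isunordered(A, B) || A != B; }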
| 224 | // Integer absolute. | |||
| 225 | if (Subtarget.canUseCMOV()) { | |||
| 226 | setOperationAction(ISD::ABS , MVT::i16 , Custom); | |||
| 227 | setOperationAction(ISD::ABS , MVT::i32 , Custom); | |||
| 228 | if (Subtarget.is64Bit()) | |||
| 229 | setOperationAction(ISD::ABS , MVT::i64 , Custom); | |||
| 230 | } | |||
| 231 | ||||
| 232 | // Absolute difference. | |||
| 233 | for (auto Op : {ISD::ABDS, ISD::ABDU}) { | |||
| 234 | setOperationAction(Op , MVT::i8 , Custom); | |||
| 235 | setOperationAction(Op , MVT::i16 , Custom); | |||
| 236 | setOperationAction(Op , MVT::i32 , Custom); | |||
| 237 | if (Subtarget.is64Bit()) | |||
| 238 | setOperationAction(Op , MVT::i64 , Custom); | |||
| 239 | } | |||
| 240 | ||||
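For reference, the node semantics being customized here, with the signed case computed in a wider type to sidestep the INT_MIN wrap-around corner (a semantics sketch, not the lowering code):

    #include <cstdint>
    int32_t abds32(int32_t A, int32_t B) {        // ISD::ABDS: signed |A - B|
      int64_t D = (int64_t)A - (int64_t)B;
      return (int32_t)(D < 0 ? -D : D);
    }
    uint32_t abdu32(uint32_t A, uint32_t B) {     // ISD::ABDU: unsigned |A - B|
      return A > B ? A - B : B - A;
    }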
| 241 | // Signed saturation subtraction. | |||
| 242 | setOperationAction(ISD::SSUBSAT , MVT::i8 , Custom); | |||
| 243 | setOperationAction(ISD::SSUBSAT , MVT::i16 , Custom); | |||
| 244 | setOperationAction(ISD::SSUBSAT , MVT::i32 , Custom); | |||
| 245 | if (Subtarget.is64Bit()) | |||
| 246 | setOperationAction(ISD::SSUBSAT , MVT::i64 , Custom); | |||
| 247 | ||||
| 248 | // Funnel shifts. | |||
| 249 | for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) { | |||
| 250 | // For slow shld targets we only lower for code size. | |||
| 251 | LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal; | |||
| 252 | ||||
| 253 | setOperationAction(ShiftOp , MVT::i8 , Custom); | |||
| 254 | setOperationAction(ShiftOp , MVT::i16 , Custom); | |||
| 255 | setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction); | |||
| 256 | if (Subtarget.is64Bit()) | |||
| 257 | setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction); | |||
| 258 | } | |||
| 259 | ||||
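FSHL/FSHR concatenate two values and shift across the seam, which is what x86's SHLD/SHRD compute; that is why i32/i64 are Legal here except on slow-SHLD parts, where the Custom lowering is used for code size only, per the comment. The node's semantics, with the amount taken modulo the bit width:

    #include <cstdint>
    uint32_t fshl32(uint32_t Hi, uint32_t Lo, unsigned Amt) {
      Amt %= 32;
      if (Amt == 0) return Hi;                  // avoid UB of a full-width shift
      return (Hi << Amt) | (Lo >> (32 - Amt));  // SHLD-style double shift
    }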
| 260 | if (!Subtarget.useSoftFloat()) { | |||
| 261 | // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this | |||
| 262 | // operation. | |||
| 263 | setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote); | |||
| 264 | setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote); | |||
| 265 | setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote); | |||
| 266 | setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote); | |||
| 267 | // We have an algorithm for SSE2, and we turn this into a 64-bit | |||
| 268 | // FILD or VCVTUSI2SS/SD for other targets. | |||
| 269 | setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom); | |||
| 270 | setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom); | |||
| 271 | // We have an algorithm for SSE2->double, and we turn this into a | |||
| 272 | // 64-bit FILD followed by conditional FADD for other targets. | |||
| 273 | setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom); | |||
| 274 | setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom); | |||
| 275 | ||||
| 276 | // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have | |||
| 277 | // this operation. | |||
| 278 | setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote); | |||
| 279 | setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote); | |||
| 280 | // SSE has no i16 to fp conversion, only i32. We promote in the handler | |||
| 281 | // to allow f80 to use i16 and f64 to use i16 with sse1 only | |||
| 282 | setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom); | |||
| 283 | setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom); | |||
| 284 | // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not | |||
| 285 | setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom); | |||
| 286 | setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom); | |||
| 287 | // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 | |||
| 288 | // are Legal, f80 is custom lowered. | |||
| 289 | setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom); | |||
| 290 | setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom); | |||
| 291 | ||||
| 292 | // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have | |||
| 293 | // this operation. | |||
| 294 | setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote); | |||
| 295 | // FIXME: This doesn't generate invalid exception when it should. PR44019. | |||
| 296 | setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote); | |||
| 297 | setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom); | |||
| 298 | setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom); | |||
| 299 | setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom); | |||
| 300 | setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom); | |||
| 301 | // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 | |||
| 302 | // are Legal, f80 is custom lowered. | |||
| 303 | setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom); | |||
| 304 | setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom); | |||
| 305 | ||||
| 306 | // Handle FP_TO_UINT by promoting the destination to a larger signed | |||
| 307 | // conversion. | |||
| 308 | setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote); | |||
| 309 | // FIXME: This doesn't generate invalid exception when it should. PR44019. | |||
| 310 | setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote); | |||
| 311 | setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote); | |||
| 312 | // FIXME: This doesn't generate invalid exception when it should. PR44019. | |||
| 313 | setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote); | |||
| 314 | setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom); | |||
| 315 | setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom); | |||
| 316 | setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom); | |||
| 317 | setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom); | |||
| 318 | ||||
| 319 | setOperationAction(ISD::LRINT, MVT::f32, Custom); | |||
| 320 | setOperationAction(ISD::LRINT, MVT::f64, Custom); | |||
| 321 | setOperationAction(ISD::LLRINT, MVT::f32, Custom); | |||
| 322 | setOperationAction(ISD::LLRINT, MVT::f64, Custom); | |||
| 323 | ||||
| 324 | if (!Subtarget.is64Bit()) { | |||
| 325 | setOperationAction(ISD::LRINT, MVT::i64, Custom); | |||
| 326 | setOperationAction(ISD::LLRINT, MVT::i64, Custom); | |||
| 327 | } | |||
| 328 | } | |||
| 329 | ||||
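The two tricks referenced in the comments above, in sketch form (this is the shape of the lowering, not the DAG-building code):

    #include <cstdint>
    // u32 -> f64: widen to a signed 64-bit value, then signed convert
    // (the "64-bit FILD" path on x87, a 64-bit CVTSI2SD on SSE2+).
    double u32_to_f64(uint32_t X) { return (double)(int64_t)(uint64_t)X; }

    // u64 -> f64: signed convert, then conditionally add 2^64 back when the
    // top bit was set ("64-bit FILD followed by conditional FADD").
    double u64_to_f64(uint64_t X) {
      double D = (double)(int64_t)X;
      if ((int64_t)X < 0) D += 18446744073709551616.0;  // += 2^64
      return D;
    }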
| 330 | if (Subtarget.hasSSE2()) { | |||
| 331 | // Custom lowering for saturating float to int conversions. | |||
| 332 | // We handle promotion to larger result types manually. | |||
| 333 | for (MVT VT : { MVT::i8, MVT::i16, MVT::i32 }) { | |||
| 334 | setOperationAction(ISD::FP_TO_UINT_SAT, VT, Custom); | |||
| 335 | setOperationAction(ISD::FP_TO_SINT_SAT, VT, Custom); | |||
| 336 | } | |||
| 337 | if (Subtarget.is64Bit()) { | |||
| 338 | setOperationAction(ISD::FP_TO_UINT_SAT, MVT::i64, Custom); | |||
| 339 | setOperationAction(ISD::FP_TO_SINT_SAT, MVT::i64, Custom); | |||
| 340 | } | |||
| 341 | } | |||
| 342 | ||||
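Unlike the plain conversions, the saturating nodes clamp out-of-range inputs and send NaN to zero rather than leaving the result undefined. Scalar semantics, as a sketch:

    #include <cmath>
    #include <cstdint>
    int32_t fptosi_sat_i32(double X) {
      if (std::isnan(X)) return 0;                   // NaN -> 0
      if (X <= (double)INT32_MIN) return INT32_MIN;  // clamp low
      if (X >= (double)INT32_MAX) return INT32_MAX;  // clamp high
      return (int32_t)X;                             // in range: truncate
    }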
| 343 | // Handle address space casts between mixed sized pointers. | |||
| 344 | setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom); | |||
| 345 | setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom); | |||
| 346 | ||||
| 347 | // TODO: when we have SSE, these could be more efficient, by using movd/movq. | |||
| 348 | if (!Subtarget.hasSSE2()) { | |||
| 349 | setOperationAction(ISD::BITCAST , MVT::f32 , Expand); | |||
| 350 | setOperationAction(ISD::BITCAST , MVT::i32 , Expand); | |||
| 351 | if (Subtarget.is64Bit()) { | |||
| 352 | setOperationAction(ISD::BITCAST , MVT::f64 , Expand); | |||
| 353 | // Without SSE, i64->f64 goes through memory. | |||
| 354 | setOperationAction(ISD::BITCAST , MVT::i64 , Expand); | |||
| 355 | } | |||
| 356 | } else if (!Subtarget.is64Bit()) | |||
| 357 | setOperationAction(ISD::BITCAST , MVT::i64 , Custom); | |||
| 358 | ||||
| 359 | // Scalar integer divide and remainder are lowered to use operations that | |||
| 360 | // produce two results, to match the available instructions. This exposes | |||
| 361 | // the two-result form to trivial CSE, which is able to combine x/y and x%y | |||
| 362 | // into a single instruction. | |||
| 363 | // | |||
| 364 | // Scalar integer multiply-high is also lowered to use two-result | |||
| 365 | // operations, to match the available instructions. However, plain multiply | |||
| 366 | // (low) operations are left as Legal, as there are single-result | |||
| 367 | // instructions for this in x86. Using the two-result multiply instructions | |||
| 368 | // when both high and low results are needed must be arranged by dagcombine. | |||
| 369 | for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { | |||
| 370 | setOperationAction(ISD::MULHS, VT, Expand); | |||
| 371 | setOperationAction(ISD::MULHU, VT, Expand); | |||
| 372 | setOperationAction(ISD::SDIV, VT, Expand); | |||
| 373 | setOperationAction(ISD::UDIV, VT, Expand); | |||
| 374 | setOperationAction(ISD::SREM, VT, Expand); | |||
| 375 | setOperationAction(ISD::UREM, VT, Expand); | |||
| 376 | } | |||
| 377 | ||||
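The payoff of the two-result form is that CSE merges a matching quotient and remainder; from the caller's side:

    // After SDIV/SREM are expanded to SDIVREM, Q and R below share a single
    // x86 IDIV, which produces both at once (EAX = quotient, EDX = remainder).
    void divmod(int A, int B, int &Q, int &R) { Q = A / B; R = A % B; }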
| 378 | setOperationAction(ISD::BR_JT , MVT::Other, Expand); | |||
| 379 | setOperationAction(ISD::BRCOND , MVT::Other, Custom); | |||
| 380 | for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128, | |||
| 381 | MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { | |||
| 382 | setOperationAction(ISD::BR_CC, VT, Expand); | |||
| 383 | setOperationAction(ISD::SELECT_CC, VT, Expand); | |||
| 384 | } | |||
| 385 | if (Subtarget.is64Bit()) | |||
| 386 | setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); | |||
| 387 | setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); | |||
| 388 | setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); | |||
| 389 | setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); | |||
| 390 | ||||
| 391 | setOperationAction(ISD::FREM , MVT::f32 , Expand); | |||
| 392 | setOperationAction(ISD::FREM , MVT::f64 , Expand); | |||
| 393 | setOperationAction(ISD::FREM , MVT::f80 , Expand); | |||
| 394 | setOperationAction(ISD::FREM , MVT::f128 , Expand); | |||
| 395 | ||||
| 396 | if (!Subtarget.useSoftFloat() && Subtarget.hasX87()) { | |||
| 397 | setOperationAction(ISD::GET_ROUNDING , MVT::i32 , Custom); | |||
| 398 | setOperationAction(ISD::SET_ROUNDING , MVT::Other, Custom); | |||
| 399 | } | |||
| 400 | ||||
| 401 | // Promote the i8 variants and force them on up to i32 which has a shorter | |||
| 402 | // encoding. | |||
| 403 | setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32); | |||
| 404 | setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32); | |||
| 405 | // Promoted i16. tzcntw has a false dependency on Intel CPUs. For BSF, we emit | |||
| 406 | // a REP prefix to encode it as TZCNT for modern CPUs so it makes sense to | |||
| 407 | // promote that too. | |||
| 408 | setOperationPromotedToType(ISD::CTTZ , MVT::i16 , MVT::i32); | |||
| 409 | setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , MVT::i32); | |||
| 410 | ||||
| 411 | if (!Subtarget.hasBMI()) { | |||
| 412 | setOperationAction(ISD::CTTZ , MVT::i32 , Custom); | |||
| 413 | setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal); | |||
| 414 | if (Subtarget.is64Bit()) { | |||
| 415 | setOperationAction(ISD::CTTZ , MVT::i64 , Custom); | |||
| 416 | setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal); | |||
| 417 | } | |||
| 418 | } | |||
| 419 | ||||
| 420 | if (Subtarget.hasLZCNT()) { | |||
| 421 | // When promoting the i8 variants, force them to i32 for a shorter | |||
| 422 | // encoding. | |||
| 423 | setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32); | |||
| 424 | setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); | |||
| 425 | } else { | |||
| 426 | for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) { | |||
| 427 | if (VT == MVT::i64 && !Subtarget.is64Bit()) | |||
| 428 | continue; | |||
| 429 | setOperationAction(ISD::CTLZ , VT, Custom); | |||
| 430 | setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom); | |||
| 431 | } | |||
| 432 | } | |||
| 433 | ||||
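Without LZCNT the Custom lowering builds CTLZ from BSR: bsr(x) is the index of the highest set bit, so ctlz(x) = 31 - bsr(x), and x == 0 needs separate handling because BSR leaves its destination undefined. In sketch form:

    #include <cstdint>
    unsigned ctlz32(uint32_t X) {
      if (X == 0) return 32;        // BSR is undefined for 0; handled apart
      unsigned HighBit = 0;
      while (X >>= 1) ++HighBit;    // stand-in for the BSR instruction
      return 31 - HighBit;
    }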
| 434 | for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16, | |||
| 435 | ISD::STRICT_FP_TO_FP16}) { | |||
| 436 | // Special handling for half-precision floating point conversions. | |||
| 437 | // If we don't have F16C support, then lower half float conversions | |||
| 438 | // into library calls. | |||
| 439 | setOperationAction( | |||
| 440 | Op, MVT::f32, | |||
| 441 | (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand); | |||
| 442 | // There's never any support for operations beyond MVT::f32. | |||
| 443 | setOperationAction(Op, MVT::f64, Expand); | |||
| 444 | setOperationAction(Op, MVT::f80, Expand); | |||
| 445 | setOperationAction(Op, MVT::f128, Expand); | |||
| 446 | } | |||
| 447 | ||||
| 448 | for (MVT VT : {MVT::f32, MVT::f64, MVT::f80, MVT::f128}) { | |||
| 449 | setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand); | |||
| 450 | setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand); | |||
| 451 | setTruncStoreAction(VT, MVT::f16, Expand); | |||
| 452 | setTruncStoreAction(VT, MVT::bf16, Expand); | |||
| 453 | ||||
| 454 | setOperationAction(ISD::BF16_TO_FP, VT, Expand); | |||
| 455 | setOperationAction(ISD::FP_TO_BF16, VT, Custom); | |||
| 456 | } | |||
| 457 | ||||
| 458 | setOperationAction(ISD::PARITY, MVT::i8, Custom); | |||
| 459 | setOperationAction(ISD::PARITY, MVT::i16, Custom); | |||
| 460 | setOperationAction(ISD::PARITY, MVT::i32, Custom); | |||
| 461 | if (Subtarget.is64Bit()) | |||
| 462 | setOperationAction(ISD::PARITY, MVT::i64, Custom); | |||
| 463 | if (Subtarget.hasPOPCNT()) { | |||
| 464 | setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32); | |||
| 465 | // popcntw is longer to encode than popcntl and also has a false dependency | |||
| 466 | // on the dest that popcntl hasn't had since Cannon Lake. | |||
| 467 | setOperationPromotedToType(ISD::CTPOP, MVT::i16, MVT::i32); | |||
| 468 | } else { | |||
| 469 | setOperationAction(ISD::CTPOP , MVT::i8 , Expand); | |||
| 470 | setOperationAction(ISD::CTPOP , MVT::i16 , Expand); | |||
| 471 | setOperationAction(ISD::CTPOP , MVT::i32 , Expand); | |||
| 472 | if (Subtarget.is64Bit()) | |||
| 473 | setOperationAction(ISD::CTPOP , MVT::i64 , Expand); | |||
| 474 | else | |||
| 475 | setOperationAction(ISD::CTPOP , MVT::i64 , Custom); | |||
| 476 | } | |||
| 477 | ||||
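When CTPOP is Expand, legalization falls back to the standard SWAR bit-count sequence; its 32-bit shape, for reference:

    #include <cstdint>
    uint32_t ctpop32(uint32_t X) {
      X = X - ((X >> 1) & 0x55555555u);                  // 2-bit partial sums
      X = (X & 0x33333333u) + ((X >> 2) & 0x33333333u);  // 4-bit partial sums
      X = (X + (X >> 4)) & 0x0F0F0F0Fu;                  // 8-bit partial sums
      return (X * 0x01010101u) >> 24;                    // fold into top byte
    }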
| 478 | setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); | |||
| 479 | ||||
| 480 | if (!Subtarget.hasMOVBE()) | |||
| 481 | setOperationAction(ISD::BSWAP , MVT::i16 , Expand); | |||
| 482 | ||||
| 483 | // X86 wants to expand cmov itself. | |||
| 484 | for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) { | |||
| 485 | setOperationAction(ISD::SELECT, VT, Custom); | |||
| 486 | setOperationAction(ISD::SETCC, VT, Custom); | |||
| 487 | setOperationAction(ISD::STRICT_FSETCC, VT, Custom); | |||
| 488 | setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); | |||
| 489 | } | |||
| 490 | for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { | |||
| 491 | if (VT == MVT::i64 && !Subtarget.is64Bit()) | |||
| 492 | continue; | |||
| 493 | setOperationAction(ISD::SELECT, VT, Custom); | |||
| 494 | setOperationAction(ISD::SETCC, VT, Custom); | |||
| 495 | } | |||
| 496 | ||||
| 497 | // Custom action for SELECT MMX and expand action for SELECT_CC MMX | |||
| 498 | setOperationAction(ISD::SELECT, MVT::x86mmx, Custom); | |||
| 499 | setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); | |||
| 500 | ||||
| 501 | setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); | |||
| 502 | // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since | |||
| 503 | // LLVM/Clang supports zero-cost DWARF and SEH exception handling. | |||
| 504 | setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); | |||
| 505 | setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); | |||
| 506 | setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); | |||
| 507 | if (TM.Options.ExceptionModel == ExceptionHandling::SjLj) | |||
| 508 | setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); | |||
| 509 | ||||
| 510 | // Darwin ABI issue. | |||
| 511 | for (auto VT : { MVT::i32, MVT::i64 }) { | |||
| 512 | if (VT == MVT::i64 && !Subtarget.is64Bit()) | |||
| 513 | continue; | |||
| 514 | setOperationAction(ISD::ConstantPool , VT, Custom); | |||
| 515 | setOperationAction(ISD::JumpTable , VT, Custom); | |||
| 516 | setOperationAction(ISD::GlobalAddress , VT, Custom); | |||
| 517 | setOperationAction(ISD::GlobalTLSAddress, VT, Custom); | |||
| 518 | setOperationAction(ISD::ExternalSymbol , VT, Custom); | |||
| 519 | setOperationAction(ISD::BlockAddress , VT, Custom); | |||
| 520 | } | |||
| 521 | ||||
| 522 | // 64-bit shl, sra, srl (iff 32-bit x86) | |||
| 523 | for (auto VT : { MVT::i32, MVT::i64 }) { | |||
| 524 | if (VT == MVT::i64 && !Subtarget.is64Bit()) | |||
| 525 | continue; | |||
| 526 | setOperationAction(ISD::SHL_PARTS, VT, Custom); | |||
| 527 | setOperationAction(ISD::SRA_PARTS, VT, Custom); | |||
| 528 | setOperationAction(ISD::SRL_PARTS, VT, Custom); | |||
| 529 | } | |||
| 530 | ||||
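The *_PARTS nodes implement a double-width shift out of two half-width registers (i64 on 32-bit x86); the Custom lowering turns them into SHLD/SHRD plus a test for amounts of 32 or more. SHL_PARTS semantics, as a sketch:

    #include <cstdint>
    void shl64_parts(uint32_t Lo, uint32_t Hi, unsigned Amt,
                     uint32_t &OutLo, uint32_t &OutHi) {
      Amt &= 63;
      if (Amt == 0)      { OutLo = Lo; OutHi = Hi; }
      else if (Amt < 32) { OutLo = Lo << Amt;
                           OutHi = (Hi << Amt) | (Lo >> (32 - Amt)); }
      else               { OutLo = 0;  OutHi = Lo << (Amt - 32); }  // Amt >= 32
    }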
| 531 | if (Subtarget.hasSSEPrefetch() || Subtarget.hasThreeDNow()) | |||
| 532 | setOperationAction(ISD::PREFETCH , MVT::Other, Legal); | |||
| 533 | ||||
| 534 | setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); | |||
| 535 | ||||
| 536 | // Expand certain atomics | |||
| 537 | for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { | |||
| 538 | setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); | |||
| 539 | setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); | |||
| 540 | setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom); | |||
| 541 | setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom); | |||
| 542 | setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom); | |||
| 543 | setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom); | |||
| 544 | setOperationAction(ISD::ATOMIC_STORE, VT, Custom); | |||
| 545 | } | |||
| 546 | ||||
| 547 | if (!Subtarget.is64Bit()) | |||
| 548 | setOperationAction(ISD::ATOMIC_LOAD, MVT::i64, Custom); | |||
| 549 | ||||
| 550 | if (Subtarget.canUseCMPXCHG16B()) | |||
| 551 | setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); | |||
| 552 | ||||
| 553 | // FIXME - use subtarget debug flags | |||
| 554 | if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() && | |||
| 555 | !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() && | |||
| 556 | TM.Options.ExceptionModel != ExceptionHandling::SjLj) { | |||
| 557 | setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); | |||
| 558 | } | |||
| 559 | ||||
| 560 | setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); | |||
| 561 | setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); | |||
| 562 | ||||
| 563 | setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); | |||
| 564 | setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); | |||
| 565 | ||||
| 566 | setOperationAction(ISD::TRAP, MVT::Other, Legal); | |||
| 567 | setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); | |||
| 568 | if (Subtarget.isTargetPS()) | |||
| 569 | setOperationAction(ISD::UBSANTRAP, MVT::Other, Expand); | |||
| 570 | else | |||
| 571 | setOperationAction(ISD::UBSANTRAP, MVT::Other, Legal); | |||
| 572 | ||||
| 573 | // VASTART needs to be custom lowered to use the VarArgsFrameIndex | |||
| 574 | setOperationAction(ISD::VASTART , MVT::Other, Custom); | |||
| 575 | setOperationAction(ISD::VAEND , MVT::Other, Expand); | |||
| 576 | bool Is64Bit = Subtarget.is64Bit(); | |||
| 577 | setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand); | |||
| 578 | setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand); | |||
| 579 | ||||
| 580 | setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); | |||
| 581 | setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); | |||
| 582 | ||||
| 583 | setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom); | |||
| 584 | ||||
| 585 | // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering. | |||
| 586 | setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom); | |||
| 587 | setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom); | |||
| 588 | ||||
| 589 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal); | |||
| 590 | ||||
| 591 | auto setF16Action = [&] (MVT VT, LegalizeAction Action) { | |||
| 592 | setOperationAction(ISD::FABS, VT, Action); | |||
| 593 | setOperationAction(ISD::FNEG, VT, Action); | |||
| 594 | setOperationAction(ISD::FCOPYSIGN, VT, Expand); | |||
| 595 | setOperationAction(ISD::FREM, VT, Action); | |||
| 596 | setOperationAction(ISD::FMA, VT, Action); | |||
| 597 | setOperationAction(ISD::FMINNUM, VT, Action); | |||
| 598 | setOperationAction(ISD::FMAXNUM, VT, Action); | |||
| 599 | setOperationAction(ISD::FMINIMUM, VT, Action); | |||
| 600 | setOperationAction(ISD::FMAXIMUM, VT, Action); | |||
| 601 | setOperationAction(ISD::FSIN, VT, Action); | |||
| 602 | setOperationAction(ISD::FCOS, VT, Action); | |||
| 603 | setOperationAction(ISD::FSINCOS, VT, Action); | |||
| 604 | setOperationAction(ISD::FSQRT, VT, Action); | |||
| 605 | setOperationAction(ISD::FPOW, VT, Action); | |||
| 606 | setOperationAction(ISD::FLOG, VT, Action); | |||
| 607 | setOperationAction(ISD::FLOG2, VT, Action); | |||
| 608 | setOperationAction(ISD::FLOG10, VT, Action); | |||
| 609 | setOperationAction(ISD::FEXP, VT, Action); | |||
| 610 | setOperationAction(ISD::FEXP2, VT, Action); | |||
| 611 | setOperationAction(ISD::FCEIL, VT, Action); | |||
| 612 | setOperationAction(ISD::FFLOOR, VT, Action); | |||
| 613 | setOperationAction(ISD::FNEARBYINT, VT, Action); | |||
| 614 | setOperationAction(ISD::FRINT, VT, Action); | |||
| 615 | setOperationAction(ISD::BR_CC, VT, Action); | |||
| 616 | setOperationAction(ISD::SETCC, VT, Action); | |||
| 617 | setOperationAction(ISD::SELECT, VT, Custom); | |||
| 618 | setOperationAction(ISD::SELECT_CC, VT, Action); | |||
| 619 | setOperationAction(ISD::FROUND, VT, Action); | |||
| 620 | setOperationAction(ISD::FROUNDEVEN, VT, Action); | |||
| 621 | setOperationAction(ISD::FTRUNC, VT, Action); | |||
| 622 | }; | |||
| 623 | ||||
| 624 | if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { | |||
| 625 | // f16, f32 and f64 use SSE. | |||
| 626 | // Set up the FP register classes. | |||
| 627 | addRegisterClass(MVT::f16, Subtarget.hasAVX512() ? &X86::FR16XRegClass | |||
| 628 | : &X86::FR16RegClass); | |||
| 629 | addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass | |||
| 630 | : &X86::FR32RegClass); | |||
| 631 | addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass | |||
| 632 | : &X86::FR64RegClass); | |||
| 633 | ||||
| 634 | // Disable f32->f64 extload as we can only generate this in one instruction | |||
| 635 | // under optsize. So it's easier to pattern match (fpext (load)) for that | |||
| 636 | // case instead of needing to emit 2 instructions for extload in the | |||
| 637 | // non-optsize case. | |||
| 638 | setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand); | |||
| 639 | ||||
| 640 | for (auto VT : { MVT::f32, MVT::f64 }) { | |||
| 641 | // Use ANDPD to simulate FABS. | |||
| 642 | setOperationAction(ISD::FABS, VT, Custom); | |||
| 643 | ||||
| 644 | // Use XORP to simulate FNEG. | |||
| 645 | setOperationAction(ISD::FNEG, VT, Custom); | |||
| 646 | ||||
| 647 | // Use ANDPD and ORPD to simulate FCOPYSIGN. | |||
| 648 | setOperationAction(ISD::FCOPYSIGN, VT, Custom); | |||
| 649 | ||||
| 650 | // These might be better off as horizontal vector ops. | |||
| 651 | setOperationAction(ISD::FADD, VT, Custom); | |||
| 652 | setOperationAction(ISD::FSUB, VT, Custom); | |||
| 653 | ||||
| 654 | // We don't support sin/cos/fmod | |||
| 655 | setOperationAction(ISD::FSIN , VT, Expand); | |||
| 656 | setOperationAction(ISD::FCOS , VT, Expand); | |||
| 657 | setOperationAction(ISD::FSINCOS, VT, Expand); | |||
| 658 | } | |||
| 659 | ||||
| 660 | // Half type will be promoted by default. | |||
| 661 | setF16Action(MVT::f16, Promote); | |||
| 662 | setOperationAction(ISD::FADD, MVT::f16, Promote); | |||
| 663 | setOperationAction(ISD::FSUB, MVT::f16, Promote); | |||
| 664 | setOperationAction(ISD::FMUL, MVT::f16, Promote); | |||
| 665 | setOperationAction(ISD::FDIV, MVT::f16, Promote); | |||
| 666 | setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); | |||
| 667 | setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); | |||
| 668 | setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); | |||
| 669 | ||||
| 670 | setOperationAction(ISD::STRICT_FADD, MVT::f16, Promote); | |||
| 671 | setOperationAction(ISD::STRICT_FSUB, MVT::f16, Promote); | |||
| 672 | setOperationAction(ISD::STRICT_FMUL, MVT::f16, Promote); | |||
| 673 | setOperationAction(ISD::STRICT_FDIV, MVT::f16, Promote); | |||
| 674 | setOperationAction(ISD::STRICT_FMA, MVT::f16, Promote); | |||
| 675 | setOperationAction(ISD::STRICT_FMINNUM, MVT::f16, Promote); | |||
| 676 | setOperationAction(ISD::STRICT_FMAXNUM, MVT::f16, Promote); | |||
| 677 | setOperationAction(ISD::STRICT_FMINIMUM, MVT::f16, Promote); | |||
| 678 | setOperationAction(ISD::STRICT_FMAXIMUM, MVT::f16, Promote); | |||
| 679 | setOperationAction(ISD::STRICT_FSQRT, MVT::f16, Promote); | |||
| 680 | setOperationAction(ISD::STRICT_FPOW, MVT::f16, Promote); | |||
| 681 | setOperationAction(ISD::STRICT_FLOG, MVT::f16, Promote); | |||
| 682 | setOperationAction(ISD::STRICT_FLOG2, MVT::f16, Promote); | |||
| 683 | setOperationAction(ISD::STRICT_FLOG10, MVT::f16, Promote); | |||
| 684 | setOperationAction(ISD::STRICT_FEXP, MVT::f16, Promote); | |||
| 685 | setOperationAction(ISD::STRICT_FEXP2, MVT::f16, Promote); | |||
| 686 | setOperationAction(ISD::STRICT_FCEIL, MVT::f16, Promote); | |||
| 687 | setOperationAction(ISD::STRICT_FFLOOR, MVT::f16, Promote); | |||
| 688 | setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f16, Promote); | |||
| 689 | setOperationAction(ISD::STRICT_FRINT, MVT::f16, Promote); | |||
| 690 | setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Promote); | |||
| 691 | setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Promote); | |||
| 692 | setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote); | |||
| 693 | setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Promote); | |||
| 694 | setOperationAction(ISD::STRICT_FTRUNC, MVT::f16, Promote); | |||
| 695 | setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); | |||
| 696 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); | |||
| 697 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); | |||
| 698 | ||||
| 699 | setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); | |||
| 700 | setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); | |||
| 701 | ||||
| 702 | // Lower this to MOVMSK plus an AND. | |||
| 703 | setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); | |||
| 704 | setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); | |||
| 705 | ||||
| 706 | } else if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1() && | |||
| 707 | (UseX87 || Is64Bit)) { | |||
| 708 | // Use SSE for f32, x87 for f64. | |||
| 709 | // Set up the FP register classes. | |||
| 710 | addRegisterClass(MVT::f32, &X86::FR32RegClass); | |||
| 711 | if (UseX87) | |||
| 712 | addRegisterClass(MVT::f64, &X86::RFP64RegClass); | |||
| 713 | ||||
| 714 | // Use ANDPS to simulate FABS. | |||
| 715 | setOperationAction(ISD::FABS , MVT::f32, Custom); | |||
| 716 | ||||
| 717 | // Use XORP to simulate FNEG. | |||
| 718 | setOperationAction(ISD::FNEG , MVT::f32, Custom); | |||
| 719 | ||||
| 720 | if (UseX87) | |||
| 721 | setOperationAction(ISD::UNDEF, MVT::f64, Expand); | |||
| 722 | ||||
| 723 | // Use ANDPS and ORPS to simulate FCOPYSIGN. | |||
| 724 | if (UseX87) | |||
| 725 | setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); | |||
| 726 | setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); | |||
| 727 | ||||
| 728 | // We don't support sin/cos/fmod | |||
| 729 | setOperationAction(ISD::FSIN , MVT::f32, Expand); | |||
| 730 | setOperationAction(ISD::FCOS , MVT::f32, Expand); | |||
| 731 | setOperationAction(ISD::FSINCOS, MVT::f32, Expand); | |||
| 732 | ||||
| 733 | if (UseX87) { | |||
| 734 | // Always expand sin/cos functions even though x87 has an instruction. | |||
| 735 | setOperationAction(ISD::FSIN, MVT::f64, Expand); | |||
| 736 | setOperationAction(ISD::FCOS, MVT::f64, Expand); | |||
| 737 | setOperationAction(ISD::FSINCOS, MVT::f64, Expand); | |||
| 738 | } | |||
| 739 | } else if (UseX87) { | |||
| 740 | // f32 and f64 in x87. | |||
| 741 | // Set up the FP register classes. | |||
| 742 | addRegisterClass(MVT::f64, &X86::RFP64RegClass); | |||
| 743 | addRegisterClass(MVT::f32, &X86::RFP32RegClass); | |||
| 744 | ||||
| 745 | for (auto VT : { MVT::f32, MVT::f64 }) { | |||
| 746 | setOperationAction(ISD::UNDEF, VT, Expand); | |||
| 747 | setOperationAction(ISD::FCOPYSIGN, VT, Expand); | |||
| 748 | ||||
| 749 | // Always expand sin/cos functions even though x87 has an instruction. | |||
| 750 | setOperationAction(ISD::FSIN , VT, Expand); | |||
| 751 | setOperationAction(ISD::FCOS , VT, Expand); | |||
| 752 | setOperationAction(ISD::FSINCOS, VT, Expand); | |||
| 753 | } | |||
| 754 | } | |||
| 755 | ||||
| 756 | // Expand FP32 immediates into loads from the stack, save special cases. | |||
| 757 | if (isTypeLegal(MVT::f32)) { | |||
| 758 | if (UseX87 && (getRegClassFor(MVT::f32) == &X86::RFP32RegClass)) { | |||
| 759 | addLegalFPImmediate(APFloat(+0.0f)); // FLD0 | |||
| 760 | addLegalFPImmediate(APFloat(+1.0f)); // FLD1 | |||
| 761 | addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS | |||
| 762 | addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS | |||
| 763 | } else // SSE immediates. | |||
| 764 | addLegalFPImmediate(APFloat(+0.0f)); // xorps | |||
| 765 | } | |||
| 766 | // Expand FP64 immediates into loads from the stack, save special cases. | |||
| 767 | if (isTypeLegal(MVT::f64)) { | |||
| 768 | if (UseX87 && getRegClassFor(MVT::f64) == &X86::RFP64RegClass) { | |||
| 769 | addLegalFPImmediate(APFloat(+0.0)); // FLD0 | |||
| 770 | addLegalFPImmediate(APFloat(+1.0)); // FLD1 | |||
| 771 | addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS | |||
| 772 | addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS | |||
| 773 | } else // SSE immediates. | |||
| 774 | addLegalFPImmediate(APFloat(+0.0)); // xorpd | |||
| 775 | } | |||
| 776 | // Support fp16 0 immediate. | |||
| 777 | if (isTypeLegal(MVT::f16)) | |||
| 778 | addLegalFPImmediate(APFloat::getZero(APFloat::IEEEhalf())); | |||
| 779 | ||||
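A "legal FP immediate" is a constant the backend can materialize without a constant-pool load; with SSE, +0.0 is the canonical case:

    // Typically compiles to: xorps %xmm0, %xmm0 ; ret  -- no memory access.
    float fzero() { return +0.0f; }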
| 780 | // Handle constrained floating-point operations of scalar. | |||
| 781 | setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal); | |||
| 782 | setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal); | |||
| 783 | setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal); | |||
| 784 | setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal); | |||
| 785 | setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal); | |||
| 786 | setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal); | |||
| 787 | setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal); | |||
| 788 | setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal); | |||
| 789 | setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal); | |||
| 790 | setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal); | |||
| 791 | setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal); | |||
| 792 | setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal); | |||
| 793 | ||||
| 794 | // We don't support FMA. | |||
| 795 | setOperationAction(ISD::FMA, MVT::f64, Expand); | |||
| 796 | setOperationAction(ISD::FMA, MVT::f32, Expand); | |||
| 797 | ||||
| 798 | // f80 always uses X87. | |||
| 799 | if (UseX87) { | |||
| 800 | addRegisterClass(MVT::f80, &X86::RFP80RegClass); | |||
| 801 | setOperationAction(ISD::UNDEF, MVT::f80, Expand); | |||
| 802 | setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); | |||
| 803 | { | |||
| 804 | APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended()); | |||
| 805 | addLegalFPImmediate(TmpFlt); // FLD0 | |||
| 806 | TmpFlt.changeSign(); | |||
| 807 | addLegalFPImmediate(TmpFlt); // FLD0/FCHS | |||
| 808 | ||||
| 809 | bool ignored; | |||
| 810 | APFloat TmpFlt2(+1.0); | |||
| 811 | TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven, | |||
| 812 | &ignored); | |||
| 813 | addLegalFPImmediate(TmpFlt2); // FLD1 | |||
| 814 | TmpFlt2.changeSign(); | |||
| 815 | addLegalFPImmediate(TmpFlt2); // FLD1/FCHS | |||
| 816 | } | |||
| 817 | ||||
| 818 | // Always expand sin/cos functions even though x87 has an instruction. | |||
| 819 | setOperationAction(ISD::FSIN , MVT::f80, Expand); | |||
| 820 | setOperationAction(ISD::FCOS , MVT::f80, Expand); | |||
| 821 | setOperationAction(ISD::FSINCOS, MVT::f80, Expand); | |||
| 822 | ||||
| 823 | setOperationAction(ISD::FFLOOR, MVT::f80, Expand); | |||
| 824 | setOperationAction(ISD::FCEIL, MVT::f80, Expand); | |||
| 825 | setOperationAction(ISD::FTRUNC, MVT::f80, Expand); | |||
| 826 | setOperationAction(ISD::FRINT, MVT::f80, Expand); | |||
| 827 | setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); | |||
| 828 | setOperationAction(ISD::FMA, MVT::f80, Expand); | |||
| 829 | setOperationAction(ISD::LROUND, MVT::f80, Expand); | |||
| 830 | setOperationAction(ISD::LLROUND, MVT::f80, Expand); | |||
| 831 | setOperationAction(ISD::LRINT, MVT::f80, Custom); | |||
| 832 | setOperationAction(ISD::LLRINT, MVT::f80, Custom); | |||
| 833 | ||||
| 834 | // Handle constrained floating-point operations of scalar. | |||
| 835 | setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal); | |||
| 836 | setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal); | |||
| 837 | setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal); | |||
| 838 | setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal); | |||
| 839 | setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal); | |||
| 840 | if (isTypeLegal(MVT::f16)) { | |||
| 841 | setOperationAction(ISD::FP_EXTEND, MVT::f80, Custom); | |||
| 842 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Custom); | |||
| 843 | } else { | |||
| 844 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal); | |||
| 845 | } | |||
| 846 | // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten | |||
| 847 | // as Custom. | |||
| 848 | setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal); | |||
| 849 | } | |||
| 850 | ||||
| 851 | // f128 uses xmm registers, but most operations require libcalls. | |||
| 852 | if (!Subtarget.useSoftFloat() && Subtarget.is64Bit() && Subtarget.hasSSE1()) { | |||
| 853 | addRegisterClass(MVT::f128, Subtarget.hasVLX() ? &X86::VR128XRegClass | |||
| 854 | : &X86::VR128RegClass); | |||
| 855 | ||||
| 856 | addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps | |||
| 857 | ||||
| 858 | setOperationAction(ISD::FADD, MVT::f128, LibCall); | |||
| 859 | setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall); | |||
| 860 | setOperationAction(ISD::FSUB, MVT::f128, LibCall); | |||
| 861 | setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall); | |||
| 862 | setOperationAction(ISD::FDIV, MVT::f128, LibCall); | |||
| 863 | setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall); | |||
| 864 | setOperationAction(ISD::FMUL, MVT::f128, LibCall); | |||
| 865 | setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall); | |||
| 866 | setOperationAction(ISD::FMA, MVT::f128, LibCall); | |||
| 867 | setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall); | |||
| 868 | ||||
| 869 | setOperationAction(ISD::FABS, MVT::f128, Custom); | |||
| 870 | setOperationAction(ISD::FNEG, MVT::f128, Custom); | |||
| 871 | setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom); | |||
| 872 | ||||
| 873 | setOperationAction(ISD::FSIN, MVT::f128, LibCall); | |||
| 874 | setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall); | |||
| 875 | setOperationAction(ISD::FCOS, MVT::f128, LibCall); | |||
| 876 | setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall); | |||
| 877 | setOperationAction(ISD::FSINCOS, MVT::f128, LibCall); | |||
| 878 | // No STRICT_FSINCOS | |||
| 879 | setOperationAction(ISD::FSQRT, MVT::f128, LibCall); | |||
| 880 | setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall); | |||
| 881 | ||||
| 882 | setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom); | |||
| 883 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom); | |||
| 884 | // We need to custom handle any FP_ROUND with an f128 input, but | |||
| 885 | // LegalizeDAG uses the result type to know when to run a custom handler. | |||
| 886 | // So we have to list all legal floating point result types here. | |||
| 887 | if (isTypeLegal(MVT::f32)) { | |||
| 888 | setOperationAction(ISD::FP_ROUND, MVT::f32, Custom); | |||
| 889 | setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom); | |||
| 890 | } | |||
| 891 | if (isTypeLegal(MVT::f64)) { | |||
| 892 | setOperationAction(ISD::FP_ROUND, MVT::f64, Custom); | |||
| 893 | setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom); | |||
| 894 | } | |||
| 895 | if (isTypeLegal(MVT::f80)) { | |||
| 896 | setOperationAction(ISD::FP_ROUND, MVT::f80, Custom); | |||
| 897 | setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Custom); | |||
| 898 | } | |||
| 899 | ||||
| 900 | setOperationAction(ISD::SETCC, MVT::f128, Custom); | |||
| 901 | ||||
| 902 | setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand); | |||
| 903 | setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand); | |||
| 904 | setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f80, Expand); | |||
| 905 | setTruncStoreAction(MVT::f128, MVT::f32, Expand); | |||
| 906 | setTruncStoreAction(MVT::f128, MVT::f64, Expand); | |||
| 907 | setTruncStoreAction(MVT::f128, MVT::f80, Expand); | |||
| 908 | } | |||
| 909 | ||||
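"xmm registers, but libcalls" in practice: f128 values are passed and returned in XMM registers under the x86-64 ABI, while the arithmetic itself is a call into compiler-rt/libgcc. A sketch, assuming the usual soft-fp helper naming:

    // On x86-64 the add below compiles to a call to __addtf3,
    // with A and B arriving in %xmm0/%xmm1.
    __float128 add128(__float128 A, __float128 B) { return A + B; }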
| 910 | // Always use a library call for pow. | |||
| 911 | setOperationAction(ISD::FPOW , MVT::f32 , Expand); | |||
| 912 | setOperationAction(ISD::FPOW , MVT::f64 , Expand); | |||
| 913 | setOperationAction(ISD::FPOW , MVT::f80 , Expand); | |||
| 914 | setOperationAction(ISD::FPOW , MVT::f128 , Expand); | |||
| 915 | ||||
| 916 | setOperationAction(ISD::FLOG, MVT::f80, Expand); | |||
| 917 | setOperationAction(ISD::FLOG2, MVT::f80, Expand); | |||
| 918 | setOperationAction(ISD::FLOG10, MVT::f80, Expand); | |||
| 919 | setOperationAction(ISD::FEXP, MVT::f80, Expand); | |||
| 920 | setOperationAction(ISD::FEXP2, MVT::f80, Expand); | |||
| 921 | setOperationAction(ISD::FMINNUM, MVT::f80, Expand); | |||
| 922 | setOperationAction(ISD::FMAXNUM, MVT::f80, Expand); | |||
| 923 | ||||
| 924 | // Some FP actions are always expanded for vector types. | |||
| 925 | for (auto VT : { MVT::v8f16, MVT::v16f16, MVT::v32f16, | |||
| 926 | MVT::v4f32, MVT::v8f32, MVT::v16f32, | |||
| 927 | MVT::v2f64, MVT::v4f64, MVT::v8f64 }) { | |||
| 928 | setOperationAction(ISD::FSIN, VT, Expand); | |||
| 929 | setOperationAction(ISD::FSINCOS, VT, Expand); | |||
| 930 | setOperationAction(ISD::FCOS, VT, Expand); | |||
| 931 | setOperationAction(ISD::FREM, VT, Expand); | |||
| 932 | setOperationAction(ISD::FCOPYSIGN, VT, Expand); | |||
| 933 | setOperationAction(ISD::FPOW, VT, Expand); | |||
| 934 | setOperationAction(ISD::FLOG, VT, Expand); | |||
| 935 | setOperationAction(ISD::FLOG2, VT, Expand); | |||
| 936 | setOperationAction(ISD::FLOG10, VT, Expand); | |||
| 937 | setOperationAction(ISD::FEXP, VT, Expand); | |||
| 938 | setOperationAction(ISD::FEXP2, VT, Expand); | |||
| 939 | } | |||
| 940 | ||||
| 941 | // First set operation action for all vector types to either promote | |||
| 942 | // (for widening) or expand (for scalarization). Then we will selectively | |||
| 943 | // turn on ones that can be effectively codegen'd. | |||
| 944 | for (MVT VT : MVT::fixedlen_vector_valuetypes()) { | |||
| 945 | setOperationAction(ISD::SDIV, VT, Expand); | |||
| 946 | setOperationAction(ISD::UDIV, VT, Expand); | |||
| 947 | setOperationAction(ISD::SREM, VT, Expand); | |||
| 948 | setOperationAction(ISD::UREM, VT, Expand); | |||
| 949 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); | |||
| 950 | setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); | |||
| 951 | setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); | |||
| 952 | setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); | |||
| 953 | setOperationAction(ISD::FMA, VT, Expand); | |||
| 954 | setOperationAction(ISD::FFLOOR, VT, Expand); | |||
| 955 | setOperationAction(ISD::FCEIL, VT, Expand); | |||
| 956 | setOperationAction(ISD::FTRUNC, VT, Expand); | |||
| 957 | setOperationAction(ISD::FRINT, VT, Expand); | |||
| 958 | setOperationAction(ISD::FNEARBYINT, VT, Expand); | |||
| 959 | setOperationAction(ISD::SMUL_LOHI, VT, Expand); | |||
| 960 | setOperationAction(ISD::MULHS, VT, Expand); | |||
| 961 | setOperationAction(ISD::UMUL_LOHI, VT, Expand); | |||
| 962 | setOperationAction(ISD::MULHU, VT, Expand); | |||
| 963 | setOperationAction(ISD::SDIVREM, VT, Expand); | |||
| 964 | setOperationAction(ISD::UDIVREM, VT, Expand); | |||
| 965 | setOperationAction(ISD::CTPOP, VT, Expand); | |||
| 966 | setOperationAction(ISD::CTTZ, VT, Expand); | |||
| 967 | setOperationAction(ISD::CTLZ, VT, Expand); | |||
| 968 | setOperationAction(ISD::ROTL, VT, Expand); | |||
| 969 | setOperationAction(ISD::ROTR, VT, Expand); | |||
| 970 | setOperationAction(ISD::BSWAP, VT, Expand); | |||
| 971 | setOperationAction(ISD::SETCC, VT, Expand); | |||
| 972 | setOperationAction(ISD::FP_TO_UINT, VT, Expand); | |||
| 973 | setOperationAction(ISD::FP_TO_SINT, VT, Expand); | |||
| 974 | setOperationAction(ISD::UINT_TO_FP, VT, Expand); | |||
| 975 | setOperationAction(ISD::SINT_TO_FP, VT, Expand); | |||
| 976 | setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); | |||
| 977 | setOperationAction(ISD::TRUNCATE, VT, Expand); | |||
| 978 | setOperationAction(ISD::SIGN_EXTEND, VT, Expand); | |||
| 979 | setOperationAction(ISD::ZERO_EXTEND, VT, Expand); | |||
| 980 | setOperationAction(ISD::ANY_EXTEND, VT, Expand); | |||
| 981 | setOperationAction(ISD::SELECT_CC, VT, Expand); | |||
| 982 | for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) { | |||
| 983 | setTruncStoreAction(InnerVT, VT, Expand); | |||
| 984 | ||||
| 985 | setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand); | |||
| 986 | setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand); | |||
| 987 | ||||
| 988 | // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like | |||
| 989 | // types, we have to deal with them whether we ask for Expansion or not. | |||
| 990 | // Setting Expand causes its own optimisation problems though, so leave | |||
| 991 | // them legal. | |||
| 992 | if (VT.getVectorElementType() == MVT::i1) | |||
| 993 | setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); | |||
| 994 | ||||
| 995 | // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are | |||
| 996 | // split/scalarized right now. | |||
| 997 | if (VT.getVectorElementType() == MVT::f16 || | |||
| 998 | VT.getVectorElementType() == MVT::bf16) | |||
| 999 | setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand); | |||
| 1000 | } | |||
| 1001 | } | |||
| 1002 | ||||
| 1003 | // FIXME: In order to prevent SSE instructions being expanded to MMX ones | |||
| 1004 | // with -msoft-float, disable use of MMX as well. | |||
| 1005 | if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) { | |||
| 1006 | addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); | |||
| 1007 | // No operations on x86mmx supported, everything uses intrinsics. | |||
| 1008 | } | |||
| 1009 | ||||
| 1010 | if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) { | |||
| 1011 | addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass | |||
| 1012 | : &X86::VR128RegClass); | |||
| 1013 | ||||
| 1014 | setOperationAction(ISD::FMAXIMUM, MVT::f32, Custom); | |||
| 1015 | setOperationAction(ISD::FMINIMUM, MVT::f32, Custom); | |||
| 1016 | ||||
| 1017 | setOperationAction(ISD::FNEG, MVT::v4f32, Custom); | |||
| 1018 | setOperationAction(ISD::FABS, MVT::v4f32, Custom); | |||
| 1019 | setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom); | |||
| 1020 | setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); | |||
| 1021 | setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); | |||
| 1022 | setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); | |||
| 1023 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); | |||
| 1024 | setOperationAction(ISD::SELECT, MVT::v4f32, Custom); | |||
| 1025 | ||||
| 1026 | setOperationAction(ISD::LOAD, MVT::v2f32, Custom); | |||
| 1027 | setOperationAction(ISD::STORE, MVT::v2f32, Custom); | |||
| 1028 | ||||
| 1029 | setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal); | |||
| 1030 | setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal); | |||
| 1031 | setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal); | |||
| 1032 | setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal); | |||
| 1033 | setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal); | |||
| 1034 | } | |||
| 1035 | ||||
| 1036 | if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) { | |||
| 1037 | addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass | |||
| 1038 | : &X86::VR128RegClass); | |||
| 1039 | ||||
| 1040 | // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM | |||
| 1041 | // registers cannot be used even for integer operations. | |||
| 1042 | addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass | |||
| 1043 | : &X86::VR128RegClass); | |||
| 1044 | addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass | |||
| 1045 | : &X86::VR128RegClass); | |||
| 1046 | addRegisterClass(MVT::v8f16, Subtarget.hasVLX() ? &X86::VR128XRegClass | |||
| 1047 | : &X86::VR128RegClass); | |||
| 1048 | addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass | |||
| 1049 | : &X86::VR128RegClass); | |||
| 1050 | addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass | |||
| 1051 | : &X86::VR128RegClass); | |||
| 1052 | ||||
| 1053 | setOperationAction(ISD::FMAXIMUM, MVT::f64, Custom); | |||
| 1054 | setOperationAction(ISD::FMINIMUM, MVT::f64, Custom); | |||
| 1055 | ||||
| 1056 | for (auto VT : { MVT::v2i8, MVT::v4i8, MVT::v8i8, | |||
| 1057 | MVT::v2i16, MVT::v4i16, MVT::v2i32 }) { | |||
| 1058 | setOperationAction(ISD::SDIV, VT, Custom); | |||
| 1059 | setOperationAction(ISD::SREM, VT, Custom); | |||
| 1060 | setOperationAction(ISD::UDIV, VT, Custom); | |||
| 1061 | setOperationAction(ISD::UREM, VT, Custom); | |||
| 1062 | } | |||
| 1063 | ||||
| 1064 | setOperationAction(ISD::MUL, MVT::v2i8, Custom); | |||
| 1065 | setOperationAction(ISD::MUL, MVT::v4i8, Custom); | |||
| 1066 | setOperationAction(ISD::MUL, MVT::v8i8, Custom); | |||
| 1067 | ||||
| 1068 | setOperationAction(ISD::MUL, MVT::v16i8, Custom); | |||
| 1069 | setOperationAction(ISD::MUL, MVT::v4i32, Custom); | |||
| 1070 | setOperationAction(ISD::MUL, MVT::v2i64, Custom); | |||
| 1071 | setOperationAction(ISD::MULHU, MVT::v4i32, Custom); | |||
| 1072 | setOperationAction(ISD::MULHS, MVT::v4i32, Custom); | |||
| 1073 | setOperationAction(ISD::MULHU, MVT::v16i8, Custom); | |||
| 1074 | setOperationAction(ISD::MULHS, MVT::v16i8, Custom); | |||
| 1075 | setOperationAction(ISD::MULHU, MVT::v8i16, Legal); | |||
| 1076 | setOperationAction(ISD::MULHS, MVT::v8i16, Legal); | |||
| 1077 | setOperationAction(ISD::MUL, MVT::v8i16, Legal); | |||
| 1078 | setOperationAction(ISD::AVGCEILU, MVT::v16i8, Legal); | |||
| 1079 | setOperationAction(ISD::AVGCEILU, MVT::v8i16, Legal); | |||
| 1080 | ||||
| 1081 | setOperationAction(ISD::SMULO, MVT::v16i8, Custom); | |||
| 1082 | setOperationAction(ISD::UMULO, MVT::v16i8, Custom); | |||
| 1083 | setOperationAction(ISD::UMULO, MVT::v2i32, Custom); | |||
| 1084 | ||||
| 1085 | setOperationAction(ISD::FNEG, MVT::v2f64, Custom); | |||
| 1086 | setOperationAction(ISD::FABS, MVT::v2f64, Custom); | |||
| 1087 | setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom); | |||
| 1088 | ||||
| 1089 | for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { | |||
| 1090 | setOperationAction(ISD::SMAX, VT, VT == MVT::v8i16 ? Legal : Custom); | |||
| 1091 | setOperationAction(ISD::SMIN, VT, VT == MVT::v8i16 ? Legal : Custom); | |||
| 1092 | setOperationAction(ISD::UMAX, VT, VT == MVT::v16i8 ? Legal : Custom); | |||
| 1093 | setOperationAction(ISD::UMIN, VT, VT == MVT::v16i8 ? Legal : Custom); | |||
| 1094 | } | |||
| 1095 | ||||
| 1096 | setOperationAction(ISD::ABDU, MVT::v16i8, Custom); | |||
| 1097 | setOperationAction(ISD::ABDU, MVT::v8i16, Custom); | |||
| 1098 | setOperationAction(ISD::ABDS, MVT::v8i16, Custom); | |||
| 1099 | ||||
| 1100 | setOperationAction(ISD::UADDSAT, MVT::v16i8, Legal); | |||
| 1101 | setOperationAction(ISD::SADDSAT, MVT::v16i8, Legal); | |||
| 1102 | setOperationAction(ISD::USUBSAT, MVT::v16i8, Legal); | |||
| 1103 | setOperationAction(ISD::SSUBSAT, MVT::v16i8, Legal); | |||
| 1104 | setOperationAction(ISD::UADDSAT, MVT::v8i16, Legal); | |||
| 1105 | setOperationAction(ISD::SADDSAT, MVT::v8i16, Legal); | |||
| 1106 | setOperationAction(ISD::USUBSAT, MVT::v8i16, Legal); | |||
| 1107 | setOperationAction(ISD::SSUBSAT, MVT::v8i16, Legal); | |||
| 1108 | setOperationAction(ISD::USUBSAT, MVT::v4i32, Custom); | |||
| 1109 | setOperationAction(ISD::USUBSAT, MVT::v2i64, Custom); | |||
| 1110 | ||||
| 1111 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); | |||
| 1112 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); | |||
| 1113 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); | |||
| 1114 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); | |||
| 1115 | ||||
| 1116 | for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { | |||
| 1117 | setOperationAction(ISD::SETCC, VT, Custom); | |||
| 1118 | setOperationAction(ISD::STRICT_FSETCC, VT, Custom); | |||
| 1119 | setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); | |||
| 1120 | setOperationAction(ISD::CTPOP, VT, Custom); | |||
| 1121 | setOperationAction(ISD::ABS, VT, Custom); | |||
| 1122 | ||||
| 1123 | // The condition codes aren't legal in SSE/AVX and under AVX512 we use | |||
| 1124 | // setcc all the way to isel and prefer SETGT in some isel patterns. | |||
| 1125 | setCondCodeAction(ISD::SETLT, VT, Custom); | |||
| 1126 | setCondCodeAction(ISD::SETLE, VT, Custom); | |||
| 1127 | } | |||
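| | // [Editorial sketch] What "Custom" buys here: SSE only has PCMPGT/PCMPEQ, | |||
| | // so a signed less-than is commuted into a greater-than with swapped | |||
| | // operands, roughly (assuming the usual DAG/DL lowering context): | |||
| | //   // (setlt LHS, RHS) == (setgt RHS, LHS), which maps onto PCMPGT. | |||
| | //   SDValue Res = DAG.getSetCC(DL, VT, RHS, LHS, ISD::SETGT); | |||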
| 1128 | ||||
| 1129 | for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) { | |||
| 1130 | setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); | |||
| 1131 | setOperationAction(ISD::BUILD_VECTOR, VT, Custom); | |||
| 1132 | setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); | |||
| 1133 | setOperationAction(ISD::VSELECT, VT, Custom); | |||
| 1134 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); | |||
| 1135 | } | |||
| 1136 | ||||
| 1137 | for (auto VT : { MVT::v8f16, MVT::v2f64, MVT::v2i64 }) { | |||
| 1138 | setOperationAction(ISD::BUILD_VECTOR, VT, Custom); | |||
| 1139 | setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); | |||
| 1140 | setOperationAction(ISD::VSELECT, VT, Custom); | |||
| 1141 | ||||
| 1142 | if (VT == MVT::v2i64 && !Subtarget.is64Bit()) | |||
| 1143 | continue; | |||
| 1144 | ||||
| 1145 | setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); | |||
| 1146 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); | |||
| 1147 | } | |||
| 1148 | setF16Action(MVT::v8f16, Expand); | |||
| 1149 | setOperationAction(ISD::FADD, MVT::v8f16, Expand); | |||
| 1150 | setOperationAction(ISD::FSUB, MVT::v8f16, Expand); | |||
| 1151 | setOperationAction(ISD::FMUL, MVT::v8f16, Expand); | |||
| 1152 | setOperationAction(ISD::FDIV, MVT::v8f16, Expand); | |||
| 1153 | ||||
| 1154 | // Custom lower v2i64 and v2f64 selects. | |||
| 1155 | setOperationAction(ISD::SELECT, MVT::v2f64, Custom); | |||
| 1156 | setOperationAction(ISD::SELECT, MVT::v2i64, Custom); | |||
| 1157 | setOperationAction(ISD::SELECT, MVT::v4i32, Custom); | |||
| 1158 | setOperationAction(ISD::SELECT, MVT::v8i16, Custom); | |||
| 1159 | setOperationAction(ISD::SELECT, MVT::v8f16, Custom); | |||
| 1160 | setOperationAction(ISD::SELECT, MVT::v16i8, Custom); | |||
| 1161 | ||||
| 1162 | setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Custom); | |||
| 1163 | setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Custom); | |||
| 1164 | setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom); | |||
| 1165 | setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); | |||
| 1166 | setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Custom); | |||
| 1167 | setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom); | |||
| 1168 | ||||
| 1169 | // Custom legalize these to avoid over promotion or custom promotion. | |||
| 1170 | for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) { | |||
| 1171 | setOperationAction(ISD::FP_TO_SINT, VT, Custom); | |||
| 1172 | setOperationAction(ISD::FP_TO_UINT, VT, Custom); | |||
| 1173 | setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom); | |||
| 1174 | setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); | |||
| 1175 | } | |||
| 1176 | ||||
| 1177 | setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Custom); | |||
| 1178 | setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Custom); | |||
| 1179 | setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); | |||
| 1180 | setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom); | |||
| 1181 | ||||
| 1182 | setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); | |||
| 1183 | setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom); | |||
| 1184 | ||||
| 1185 | setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom); | |||
| 1186 | setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom); | |||
| 1187 | ||||
| 1188 | // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. | |||
| 1189 | setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); | |||
| 1190 | setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom); | |||
| 1191 | setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); | |||
| 1192 | setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom); | |||
| 1193 | ||||
| 1194 | setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); | |||
| 1195 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom); | |||
| 1196 | setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); | |||
| 1197 | setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom); | |||
| 1198 | ||||
| 1199 | // We want to legalize this to an f64 load rather than an i64 load on | |||
| 1200 | // 64-bit targets and two 32-bit loads on a 32-bit target. Similar for | |||
| 1201 | // store. | |||
| 1202 | setOperationAction(ISD::LOAD, MVT::v2i32, Custom); | |||
| 1203 | setOperationAction(ISD::LOAD, MVT::v4i16, Custom); | |||
| 1204 | setOperationAction(ISD::LOAD, MVT::v8i8, Custom); | |||
| 1205 | setOperationAction(ISD::STORE, MVT::v2i32, Custom); | |||
| 1206 | setOperationAction(ISD::STORE, MVT::v4i16, Custom); | |||
| 1207 | setOperationAction(ISD::STORE, MVT::v8i8, Custom); | |||
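| | // [Editorial sketch] The shape of the 64-bit-target trick described above; | |||
| | // an illustration assuming the usual lowering context, not the actual | |||
| | // LowerLoad code: | |||
| | //   // One f64 load covers all 64 bits; move it into the vector domain and | |||
| | //   // reinterpret as integers (the low half then holds the v2i32 value). | |||
| | //   SDValue Ld  = DAG.getLoad(MVT::f64, DL, Chain, Ptr, | |||
| | //                             MachinePointerInfo()); | |||
| | //   SDValue Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, Ld); | |||
| | //   SDValue Res = DAG.getBitcast(MVT::v4i32, Vec); | |||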
| 1208 | ||||
| 1209 | // Add 32-bit vector stores to help vectorization opportunities. | |||
| 1210 | setOperationAction(ISD::STORE, MVT::v2i16, Custom); | |||
| 1211 | setOperationAction(ISD::STORE, MVT::v4i8, Custom); | |||
| 1212 | ||||
| 1213 | setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); | |||
| 1214 | setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); | |||
| 1215 | setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); | |||
| 1216 | if (!Subtarget.hasAVX512()) | |||
| 1217 | setOperationAction(ISD::BITCAST, MVT::v16i1, Custom); | |||
| 1218 | ||||
| 1219 | setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom); | |||
| 1220 | setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom); | |||
| 1221 | setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom); | |||
| 1222 | ||||
| 1223 | setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); | |||
| 1224 | ||||
| 1225 | setOperationAction(ISD::TRUNCATE, MVT::v2i8, Custom); | |||
| 1226 | setOperationAction(ISD::TRUNCATE, MVT::v2i16, Custom); | |||
| 1227 | setOperationAction(ISD::TRUNCATE, MVT::v2i32, Custom); | |||
| 1228 | setOperationAction(ISD::TRUNCATE, MVT::v4i8, Custom); | |||
| 1229 | setOperationAction(ISD::TRUNCATE, MVT::v4i16, Custom); | |||
| 1230 | setOperationAction(ISD::TRUNCATE, MVT::v8i8, Custom); | |||
| 1231 | ||||
| 1232 | // In the customized shift lowering, the legal v4i32/v2i64 cases | |||
| 1233 | // in AVX2 will be recognized. | |||
| 1234 | for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { | |||
| 1235 | setOperationAction(ISD::SRL, VT, Custom); | |||
| 1236 | setOperationAction(ISD::SHL, VT, Custom); | |||
| 1237 | setOperationAction(ISD::SRA, VT, Custom); | |||
| 1238 | if (VT == MVT::v2i64) continue; | |||
| 1239 | setOperationAction(ISD::ROTL, VT, Custom); | |||
| 1240 | setOperationAction(ISD::ROTR, VT, Custom); | |||
| 1241 | setOperationAction(ISD::FSHL, VT, Custom); | |||
| 1242 | setOperationAction(ISD::FSHR, VT, Custom); | |||
| 1243 | } | |||
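| | // [Editorial sketch] "Custom" routes these shift nodes through this | |||
| | // target's LowerOperation hook; the dispatch is shaped roughly like: | |||
| | //   switch (Op.getOpcode()) { | |||
| | //   case ISD::SHL: | |||
| | //   case ISD::SRA: | |||
| | //   case ISD::SRL: return LowerShift(Op, Subtarget, DAG); | |||
| | //   ... | |||
| | //   } | |||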
| 1244 | ||||
| 1245 | setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal); | |||
| 1246 | setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal); | |||
| 1247 | setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal); | |||
| 1248 | setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal); | |||
| 1249 | setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal); | |||
| 1250 | } | |||
| 1251 | ||||
| 1252 | if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) { | |||
| 1253 | setOperationAction(ISD::ABS, MVT::v16i8, Legal); | |||
| 1254 | setOperationAction(ISD::ABS, MVT::v8i16, Legal); | |||
| 1255 | setOperationAction(ISD::ABS, MVT::v4i32, Legal); | |||
| 1256 | setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom); | |||
| 1257 | setOperationAction(ISD::CTLZ, MVT::v16i8, Custom); | |||
| 1258 | setOperationAction(ISD::CTLZ, MVT::v8i16, Custom); | |||
| 1259 | setOperationAction(ISD::CTLZ, MVT::v4i32, Custom); | |||
| 1260 | setOperationAction(ISD::CTLZ, MVT::v2i64, Custom); | |||
| 1261 | ||||
| 1262 | // These might be better off as horizontal vector ops. | |||
| 1263 | setOperationAction(ISD::ADD, MVT::i16, Custom); | |||
| 1264 | setOperationAction(ISD::ADD, MVT::i32, Custom); | |||
| 1265 | setOperationAction(ISD::SUB, MVT::i16, Custom); | |||
| 1266 | setOperationAction(ISD::SUB, MVT::i32, Custom); | |||
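| | // [Editorial example] The pattern being targeted: an i32 add of two | |||
| | // adjacent lanes maps onto the SSSE3 horizontal add. In intrinsics: | |||
| | //   #include <immintrin.h> | |||
| | //   __m128i sumAdjacentPairs(__m128i V) { | |||
| | //     return _mm_hadd_epi32(V, V); // lane 0 = V[0]+V[1], lane 1 = V[2]+V[3] | |||
| | //   } | |||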
| 1267 | } | |||
| 1268 | ||||
| 1269 | if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) { | |||
| 1270 | for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) { | |||
| 1271 | setOperationAction(ISD::FFLOOR, RoundedTy, Legal); | |||
| 1272 | setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal); | |||
| 1273 | setOperationAction(ISD::FCEIL, RoundedTy, Legal); | |||
| 1274 | setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal); | |||
| 1275 | setOperationAction(ISD::FTRUNC, RoundedTy, Legal); | |||
| 1276 | setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal); | |||
| 1277 | setOperationAction(ISD::FRINT, RoundedTy, Legal); | |||
| 1278 | setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal); | |||
| 1279 | setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal); | |||
| 1280 | setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal); | |||
| 1281 | setOperationAction(ISD::FROUNDEVEN, RoundedTy, Legal); | |||
| 1282 | setOperationAction(ISD::STRICT_FROUNDEVEN, RoundedTy, Legal); | |||
| 1283 | ||||
| 1284 | setOperationAction(ISD::FROUND, RoundedTy, Custom); | |||
| 1285 | } | |||
| 1286 | ||||
| 1287 | setOperationAction(ISD::SMAX, MVT::v16i8, Legal); | |||
| 1288 | setOperationAction(ISD::SMAX, MVT::v4i32, Legal); | |||
| 1289 | setOperationAction(ISD::UMAX, MVT::v8i16, Legal); | |||
| 1290 | setOperationAction(ISD::UMAX, MVT::v4i32, Legal); | |||
| 1291 | setOperationAction(ISD::SMIN, MVT::v16i8, Legal); | |||
| 1292 | setOperationAction(ISD::SMIN, MVT::v4i32, Legal); | |||
| 1293 | setOperationAction(ISD::UMIN, MVT::v8i16, Legal); | |||
| 1294 | setOperationAction(ISD::UMIN, MVT::v4i32, Legal); | |||
| 1295 | ||||
| 1296 | for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64}) { | |||
| 1297 | setOperationAction(ISD::ABDS, VT, Custom); | |||
| 1298 | setOperationAction(ISD::ABDU, VT, Custom); | |||
| 1299 | } | |||
| 1300 | ||||
| 1301 | setOperationAction(ISD::UADDSAT, MVT::v4i32, Custom); | |||
| 1302 | setOperationAction(ISD::SADDSAT, MVT::v2i64, Custom); | |||
| 1303 | setOperationAction(ISD::SSUBSAT, MVT::v2i64, Custom); | |||
| 1304 | ||||
| 1305 | // FIXME: Do we need to handle scalar-to-vector here? | |||
| 1306 | setOperationAction(ISD::MUL, MVT::v4i32, Legal); | |||
| 1307 | setOperationAction(ISD::SMULO, MVT::v2i32, Custom); | |||
| 1308 | ||||
| 1309 | // We directly match byte blends in the backend as they match the VSELECT | |||
| 1310 | // condition form. | |||
| 1311 | setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); | |||
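| | // [Editorial example] The VSELECT condition form matches PBLENDVB, which | |||
| | // picks each byte by the sign bit of the corresponding mask byte: | |||
| | //   #include <immintrin.h> | |||
| | //   __m128i byteBlend(__m128i Mask, __m128i A, __m128i B) { | |||
| | //     return _mm_blendv_epi8(B, A, Mask); // mask MSB set -> take A's byte | |||
| | //   } | |||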
| 1312 | ||||
| 1313 | // SSE41 brings specific instructions for doing vector sign extend even in | |||
| 1314 | // cases where we don't have SRA. | |||
| 1315 | for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { | |||
| 1316 | setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal); | |||
| 1317 | setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal); | |||
| 1318 | } | |||
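| | // [Editorial example] e.g. PMOVSXBW sign-extends the low eight bytes to | |||
| | // eight words in one instruction, with no shift sequence needed: | |||
| | //   #include <immintrin.h> | |||
| | //   __m128i sext8x8to16(__m128i V) { return _mm_cvtepi8_epi16(V); } | |||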
| 1319 | ||||
| 1320 | // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X | |||
| 1321 | for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { | |||
| 1322 | setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); | |||
| 1323 | setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); | |||
| 1324 | setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); | |||
| 1325 | setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); | |||
| 1326 | setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); | |||
| 1327 | setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal); | |||
| 1328 | } | |||
| 1329 | ||||
| 1330 | if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) { | |||
| 1331 | // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can | |||
| 1332 | // do the pre and post work in the vector domain. | |||
| 1333 | setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom); | |||
| 1334 | setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom); | |||
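| | // [Editorial sketch] The general shape of that scalarization (illustrative | |||
| | // only; the real lowering also does the unsigned-range fixups in the | |||
| | // vector domain, as the comment above says): | |||
| | //   SmallVector<SDValue, 4> Elts; | |||
| | //   for (unsigned i = 0; i != 4; ++i) { | |||
| | //     SDValue El = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Src, | |||
| | //                              DAG.getIntPtrConstant(i, DL)); | |||
| | //     Elts.push_back(DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, El)); | |||
| | //   } | |||
| | //   SDValue Res = DAG.getBuildVector(MVT::v4f32, DL, Elts); | |||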
| 1335 | // We need to mark SINT_TO_FP as Custom even though we want to expand it | |||
| 1336 | // so that DAG combine doesn't try to turn it into uint_to_fp. | |||
| 1337 | setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom); | |||
| 1338 | setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom); | |||
| 1339 | } | |||
| 1340 | } | |||
| 1341 | ||||
| 1342 | if (!Subtarget.useSoftFloat() && Subtarget.hasSSE42()) { | |||
| 1343 | setOperationAction(ISD::UADDSAT, MVT::v2i64, Custom); | |||
| 1344 | } | |||
| 1345 | ||||
| 1346 | if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) { | |||
| 1347 | for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, | |||
| 1348 | MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { | |||
| 1349 | setOperationAction(ISD::ROTL, VT, Custom); | |||
| 1350 | setOperationAction(ISD::ROTR, VT, Custom); | |||
| 1351 | } | |||
| 1352 | ||||
| 1353 | // XOP can efficiently perform BITREVERSE with VPPERM. | |||
| 1354 | for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) | |||
| 1355 | setOperationAction(ISD::BITREVERSE, VT, Custom); | |||
| 1356 | ||||
| 1357 | for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, | |||
| 1358 | MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) | |||
| 1359 | setOperationAction(ISD::BITREVERSE, VT, Custom); | |||
| 1360 | } | |||
| 1361 | ||||
| 1362 | if (!Subtarget.useSoftFloat() && Subtarget.hasAVX()) { | |||
| 1363 | bool HasInt256 = Subtarget.hasInt256(); | |||
| 1364 | ||||
| 1365 | addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass | |||
| 1366 | : &X86::VR256RegClass); | |||
| 1367 | addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass | |||
| 1368 | : &X86::VR256RegClass); | |||
| 1369 | addRegisterClass(MVT::v16f16, Subtarget.hasVLX() ? &X86::VR256XRegClass | |||
| 1370 | : &X86::VR256RegClass); | |||
| 1371 | addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass | |||
| 1372 | : &X86::VR256RegClass); | |||
| 1373 | addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass | |||
| 1374 | : &X86::VR256RegClass); | |||
| 1375 | addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass | |||
| 1376 | : &X86::VR256RegClass); | |||
| 1377 | addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass | |||
| 1378 | : &X86::VR256RegClass); | |||
| 1379 | ||||
| 1380 | for (auto VT : { MVT::v8f32, MVT::v4f64 }) { | |||
| 1381 | setOperationAction(ISD::FFLOOR, VT, Legal); | |||
| 1382 | setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); | |||
| 1383 | setOperationAction(ISD::FCEIL, VT, Legal); | |||
| 1384 | setOperationAction(ISD::STRICT_FCEIL, VT, Legal); | |||
| 1385 | setOperationAction(ISD::FTRUNC, VT, Legal); | |||
| 1386 | setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); | |||
| 1387 | setOperationAction(ISD::FRINT, VT, Legal); | |||
| 1388 | setOperationAction(ISD::STRICT_FRINT, VT, Legal); | |||
| 1389 | setOperationAction(ISD::FNEARBYINT, VT, Legal); | |||
| 1390 | setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); | |||
| 1391 | setOperationAction(ISD::FROUNDEVEN, VT, Legal); | |||
| 1392 | setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); | |||
| 1393 | ||||
| 1394 | setOperationAction(ISD::FROUND, VT, Custom); | |||
| 1395 | ||||
| 1396 | setOperationAction(ISD::FNEG, VT, Custom); | |||
| 1397 | setOperationAction(ISD::FABS, VT, Custom); | |||
| 1398 | setOperationAction(ISD::FCOPYSIGN, VT, Custom); | |||
| 1399 | } | |||
| 1400 | ||||
| 1401 | // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted | |||
| 1402 | // even though v8i16 is a legal type. | |||
| 1403 | setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32); | |||
| 1404 | setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32); | |||
| 1405 | setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32); | |||
| 1406 | setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32); | |||
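| | // [Editorial sketch] Promotion here means the node is rebuilt at the wider | |||
| | // type and truncated back, roughly: | |||
| | //   // (v8i16 (fp_to_sint X:v8f32)) becomes: | |||
| | //   SDValue Wide = DAG.getNode(ISD::FP_TO_SINT, DL, MVT::v8i32, X); | |||
| | //   SDValue Res  = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i16, Wide); | |||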
| 1407 | setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Custom); | |||
| 1408 | setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Custom); | |||
| 1409 | setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Custom); | |||
| 1410 | ||||
| 1411 | setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Custom); | |||
| 1412 | setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Custom); | |||
| 1413 | setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Expand); | |||
| 1414 | setOperationAction(ISD::FP_ROUND, MVT::v8f16, Expand); | |||
| 1415 | setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Custom); | |||
| 1416 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Custom); | |||
| 1417 | ||||
| 1418 | setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal); | |||
| 1419 | setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal); | |||
| 1420 | setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal); | |||
| 1421 | setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal); | |||
| 1422 | setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal); | |||
| 1423 | setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal); | |||
| 1424 | setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal); | |||
| 1425 | setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal); | |||
| 1426 | setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal); | |||
| 1427 | setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal); | |||
| 1428 | setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal); | |||
| 1429 | ||||
| 1430 | if (!Subtarget.hasAVX512()) | |||
| 1431 | setOperationAction(ISD::BITCAST, MVT::v32i1, Custom); | |||
| 1432 | ||||
| 1433 | // In the customized shift lowering, the legal v8i32/v4i64 cases | |||
| 1434 | // in AVX2 will be recognized. | |||
| 1435 | for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { | |||
| 1436 | setOperationAction(ISD::SRL, VT, Custom); | |||
| 1437 | setOperationAction(ISD::SHL, VT, Custom); | |||
| 1438 | setOperationAction(ISD::SRA, VT, Custom); | |||
| 1439 | setOperationAction(ISD::ABDS, VT, Custom); | |||
| 1440 | setOperationAction(ISD::ABDU, VT, Custom); | |||
| 1441 | if (VT == MVT::v4i64) continue; | |||
| 1442 | setOperationAction(ISD::ROTL, VT, Custom); | |||
| 1443 | setOperationAction(ISD::ROTR, VT, Custom); | |||
| 1444 | setOperationAction(ISD::FSHL, VT, Custom); | |||
| 1445 | setOperationAction(ISD::FSHR, VT, Custom); | |||
| 1446 | } | |||
| 1447 | ||||
| 1448 | // These types need custom splitting if their input is a 128-bit vector. | |||
| 1449 | setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); | |||
| 1450 | setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); | |||
| 1451 | setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); | |||
| 1452 | setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); | |||
| 1453 | ||||
| 1454 | setOperationAction(ISD::SELECT, MVT::v4f64, Custom); | |||
| 1455 | setOperationAction(ISD::SELECT, MVT::v4i64, Custom); | |||
| 1456 | setOperationAction(ISD::SELECT, MVT::v8i32, Custom); | |||
| 1457 | setOperationAction(ISD::SELECT, MVT::v16i16, Custom); | |||
| 1458 | setOperationAction(ISD::SELECT, MVT::v16f16, Custom); | |||
| 1459 | setOperationAction(ISD::SELECT, MVT::v32i8, Custom); | |||
| 1460 | setOperationAction(ISD::SELECT, MVT::v8f32, Custom); | |||
| 1461 | ||||
| 1462 | for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { | |||
| 1463 | setOperationAction(ISD::SIGN_EXTEND, VT, Custom); | |||
| 1464 | setOperationAction(ISD::ZERO_EXTEND, VT, Custom); | |||
| 1465 | setOperationAction(ISD::ANY_EXTEND, VT, Custom); | |||
| 1466 | } | |||
| 1467 | ||||
| 1468 | setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); | |||
| 1469 | setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); | |||
| 1470 | setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); | |||
| 1471 | setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom); | |||
| 1472 | ||||
| 1473 | for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { | |||
| 1474 | setOperationAction(ISD::SETCC, VT, Custom); | |||
| 1475 | setOperationAction(ISD::STRICT_FSETCC, VT, Custom); | |||
| 1476 | setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); | |||
| 1477 | setOperationAction(ISD::CTPOP, VT, Custom); | |||
| 1478 | setOperationAction(ISD::CTLZ, VT, Custom); | |||
| 1479 | ||||
| 1480 | // The condition codes aren't legal in SSE/AVX and under AVX512 we use | |||
| 1481 | // setcc all the way to isel and prefer SETGT in some isel patterns. | |||
| 1482 | setCondCodeAction(ISD::SETLT, VT, Custom); | |||
| 1483 | setCondCodeAction(ISD::SETLE, VT, Custom); | |||
| 1484 | } | |||
| 1485 | ||||
| 1486 | if (Subtarget.hasAnyFMA()) { | |||
| 1487 | for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32, | |||
| 1488 | MVT::v2f64, MVT::v4f64 }) { | |||
| 1489 | setOperationAction(ISD::FMA, VT, Legal); | |||
| 1490 | setOperationAction(ISD::STRICT_FMA, VT, Legal); | |||
| 1491 | } | |||
| 1492 | } | |||
| 1493 | ||||
| 1494 | for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) { | |||
| 1495 | setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom); | |||
| 1496 | setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom); | |||
| 1497 | } | |||
| 1498 | ||||
| 1499 | setOperationAction(ISD::MUL, MVT::v4i64, Custom); | |||
| 1500 | setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom); | |||
| 1501 | setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom); | |||
| 1502 | setOperationAction(ISD::MUL, MVT::v32i8, Custom); | |||
| 1503 | ||||
| 1504 | setOperationAction(ISD::MULHU, MVT::v8i32, Custom); | |||
| 1505 | setOperationAction(ISD::MULHS, MVT::v8i32, Custom); | |||
| 1506 | setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom); | |||
| 1507 | setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom); | |||
| 1508 | setOperationAction(ISD::MULHU, MVT::v32i8, Custom); | |||
| 1509 | setOperationAction(ISD::MULHS, MVT::v32i8, Custom); | |||
| 1510 | setOperationAction(ISD::AVGCEILU, MVT::v16i16, HasInt256 ? Legal : Custom); | |||
| 1511 | setOperationAction(ISD::AVGCEILU, MVT::v32i8, HasInt256 ? Legal : Custom); | |||
| 1512 | ||||
| 1513 | setOperationAction(ISD::SMULO, MVT::v32i8, Custom); | |||
| 1514 | setOperationAction(ISD::UMULO, MVT::v32i8, Custom); | |||
| 1515 | ||||
| 1516 | setOperationAction(ISD::ABS, MVT::v4i64, Custom); | |||
| 1517 | setOperationAction(ISD::SMAX, MVT::v4i64, Custom); | |||
| 1518 | setOperationAction(ISD::UMAX, MVT::v4i64, Custom); | |||
| 1519 | setOperationAction(ISD::SMIN, MVT::v4i64, Custom); | |||
| 1520 | setOperationAction(ISD::UMIN, MVT::v4i64, Custom); | |||
| 1521 | ||||
| 1522 | setOperationAction(ISD::UADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); | |||
| 1523 | setOperationAction(ISD::SADDSAT, MVT::v32i8, HasInt256 ? Legal : Custom); | |||
| 1524 | setOperationAction(ISD::USUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); | |||
| 1525 | setOperationAction(ISD::SSUBSAT, MVT::v32i8, HasInt256 ? Legal : Custom); | |||
| 1526 | setOperationAction(ISD::UADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); | |||
| 1527 | setOperationAction(ISD::SADDSAT, MVT::v16i16, HasInt256 ? Legal : Custom); | |||
| 1528 | setOperationAction(ISD::USUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); | |||
| 1529 | setOperationAction(ISD::SSUBSAT, MVT::v16i16, HasInt256 ? Legal : Custom); | |||
| 1530 | setOperationAction(ISD::UADDSAT, MVT::v8i32, Custom); | |||
| 1531 | setOperationAction(ISD::USUBSAT, MVT::v8i32, Custom); | |||
| 1532 | setOperationAction(ISD::UADDSAT, MVT::v4i64, Custom); | |||
| 1533 | setOperationAction(ISD::USUBSAT, MVT::v4i64, Custom); | |||
| 1534 | ||||
| 1535 | for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) { | |||
| 1536 | setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom); | |||
| 1537 | setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom); | |||
| 1538 | setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom); | |||
| 1539 | setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom); | |||
| 1540 | setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom); | |||
| 1541 | } | |||
| 1542 | ||||
| 1543 | for (auto VT : {MVT::v16i16, MVT::v8i32, MVT::v4i64}) { | |||
| 1544 | setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); | |||
| 1545 | setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); | |||
| 1546 | } | |||
| 1547 | ||||
| 1548 | if (HasInt256) { | |||
| 1549 | // The custom lowering for UINT_TO_FP for v8i32 becomes interesting | |||
| 1550 | // when we have a 256-bit-wide blend with immediate. | |||
| 1551 | setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom); | |||
| 1552 | setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom); | |||
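| | // [Editorial sketch] The classic unsigned-conversion trick this enables: | |||
| | // split each u32 into 16-bit halves, convert both halves as signed values, | |||
| | // then recombine as fp(Hi) * 65536.0 + fp(Lo). The halving step, roughly: | |||
| | //   SDValue Lo = DAG.getNode(ISD::AND, DL, VT, X, | |||
| | //                            DAG.getConstant(0xFFFF, DL, VT)); | |||
| | //   SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, X, | |||
| | //                            DAG.getConstant(16, DL, VT)); | |||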
| 1553 | ||||
| 1554 | // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X | |||
| 1555 | for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { | |||
| 1556 | setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal); | |||
| 1557 | setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal); | |||
| 1558 | setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal); | |||
| 1559 | setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal); | |||
| 1560 | setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal); | |||
| 1561 | setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal); | |||
| 1562 | } | |||
| 1563 | } | |||
| 1564 | ||||
| 1565 | for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, | |||
| 1566 | MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { | |||
| 1567 | setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); | |||
| 1568 | setOperationAction(ISD::MSTORE, VT, Legal); | |||
| 1569 | } | |||
| 1570 | ||||
| 1571 | // Extract subvector is special because the value type | |||
| 1572 | // (result) is 128-bit but the source is 256-bit wide. | |||
| 1573 | for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, | |||
| 1574 | MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { | |||
| 1575 | setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); | |||
| 1576 | } | |||
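| | // [Editorial example] e.g. taking the upper v4f32 half of a v8f32 value: | |||
| | //   SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4f32, V, | |||
| | //                            DAG.getIntPtrConstant(4, DL)); | |||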
| 1577 | ||||
| 1578 | // Custom lower several nodes for 256-bit types. | |||
| 1579 | for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, | |||
| 1580 | MVT::v16f16, MVT::v8f32, MVT::v4f64 }) { | |||
| 1581 | setOperationAction(ISD::BUILD_VECTOR, VT, Custom); | |||
| 1582 | setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); | |||
| 1583 | setOperationAction(ISD::VSELECT, VT, Custom); | |||
| 1584 | setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); | |||
| 1585 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); | |||
| 1586 | setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); | |||
| 1587 | setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); | |||
| 1588 | setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); | |||
| 1589 | setOperationAction(ISD::STORE, VT, Custom); | |||
| 1590 | } | |||
| 1591 | setF16Action(MVT::v16f16, Expand); | |||
| 1592 | setOperationAction(ISD::FADD, MVT::v16f16, Expand); | |||
| 1593 | setOperationAction(ISD::FSUB, MVT::v16f16, Expand); | |||
| 1594 | setOperationAction(ISD::FMUL, MVT::v16f16, Expand); | |||
| 1595 | setOperationAction(ISD::FDIV, MVT::v16f16, Expand); | |||
| 1596 | ||||
| 1597 | if (HasInt256) { | |||
| 1598 | setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); | |||
| 1599 | ||||
| 1600 | // Custom legalize 2x32 to get a little better code. | |||
| 1601 | setOperationAction(ISD::MGATHER, MVT::v2f32, Custom); | |||
| 1602 | setOperationAction(ISD::MGATHER, MVT::v2i32, Custom); | |||
| 1603 | ||||
| 1604 | for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, | |||
| 1605 | MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) | |||
| 1606 | setOperationAction(ISD::MGATHER, VT, Custom); | |||
| 1607 | } | |||
| 1608 | } | |||
| 1609 | ||||
| 1610 | if (!Subtarget.useSoftFloat() && !Subtarget.hasFP16() && | |||
| 1611 | Subtarget.hasF16C()) { | |||
| 1612 | for (MVT VT : { MVT::f16, MVT::v2f16, MVT::v4f16, MVT::v8f16 }) { | |||
| 1613 | setOperationAction(ISD::FP_ROUND, VT, Custom); | |||
| 1614 | setOperationAction(ISD::STRICT_FP_ROUND, VT, Custom); | |||
| 1615 | } | |||
| 1616 | for (MVT VT : { MVT::f32, MVT::v2f32, MVT::v4f32 }) { | |||
| 1617 | setOperationAction(ISD::FP_EXTEND, VT, Custom); | |||
| 1618 | setOperationAction(ISD::STRICT_FP_EXTEND, VT, Custom); | |||
| 1619 | } | |||
| 1620 | for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) { | |||
| 1621 | setOperationPromotedToType(Opc, MVT::v8f16, MVT::v8f32); | |||
| 1622 | setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32); | |||
| 1623 | } | |||
| 1624 | ||||
| 1625 | setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); | |||
| 1626 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); | |||
| 1627 | } | |||
| 1628 | ||||
| 1629 | // This block controls legalization of the mask vector sizes that are | |||
| 1630 | // available with AVX512. 512-bit vectors are in a separate block controlled | |||
| 1631 | // by useAVX512Regs. | |||
| 1632 | if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { | |||
| 1633 | addRegisterClass(MVT::v1i1, &X86::VK1RegClass); | |||
| 1634 | addRegisterClass(MVT::v2i1, &X86::VK2RegClass); | |||
| 1635 | addRegisterClass(MVT::v4i1, &X86::VK4RegClass); | |||
| 1636 | addRegisterClass(MVT::v8i1, &X86::VK8RegClass); | |||
| 1637 | addRegisterClass(MVT::v16i1, &X86::VK16RegClass); | |||
| 1638 | ||||
| 1639 | setOperationAction(ISD::SELECT, MVT::v1i1, Custom); | |||
| 1640 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); | |||
| 1641 | setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); | |||
| 1642 | ||||
| 1643 | setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32); | |||
| 1644 | setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32); | |||
| 1645 | setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32); | |||
| 1646 | setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32); | |||
| 1647 | setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32); | |||
| 1648 | setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32); | |||
| 1649 | setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32); | |||
| 1650 | setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32); | |||
| 1651 | setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom); | |||
| 1652 | setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom); | |||
| 1653 | setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom); | |||
| 1654 | setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom); | |||
| 1655 | ||||
| 1656 | // There is no byte sized k-register load or store without AVX512DQ. | |||
| 1657 | if (!Subtarget.hasDQI()) { | |||
| 1658 | setOperationAction(ISD::LOAD, MVT::v1i1, Custom); | |||
| 1659 | setOperationAction(ISD::LOAD, MVT::v2i1, Custom); | |||
| 1660 | setOperationAction(ISD::LOAD, MVT::v4i1, Custom); | |||
| 1661 | setOperationAction(ISD::LOAD, MVT::v8i1, Custom); | |||
| 1662 | ||||
| 1663 | setOperationAction(ISD::STORE, MVT::v1i1, Custom); | |||
| 1664 | setOperationAction(ISD::STORE, MVT::v2i1, Custom); | |||
| 1665 | setOperationAction(ISD::STORE, MVT::v4i1, Custom); | |||
| 1666 | setOperationAction(ISD::STORE, MVT::v8i1, Custom); | |||
| 1667 | } | |||
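| | // [Editorial sketch] One plausible shape for the custom handling: widen the | |||
| | // narrow mask to v16i1 so the value can be moved through the 16-bit KMOVW | |||
| | // and stored as a byte (illustrative only, not the actual lowering): | |||
| | //   SDValue Wide = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1, | |||
| | //                              DAG.getUNDEF(MVT::v16i1), Val, | |||
| | //                              DAG.getIntPtrConstant(0, DL)); | |||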
| 1668 | ||||
| 1669 | // Extends of v16i1/v8i1/v4i1/v2i1 to 128-bit vectors. | |||
| 1670 | for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { | |||
| 1671 | setOperationAction(ISD::SIGN_EXTEND, VT, Custom); | |||
| 1672 | setOperationAction(ISD::ZERO_EXTEND, VT, Custom); | |||
| 1673 | setOperationAction(ISD::ANY_EXTEND, VT, Custom); | |||
| 1674 | } | |||
| 1675 | ||||
| 1676 | for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) | |||
| 1677 | setOperationAction(ISD::VSELECT, VT, Expand); | |||
| 1678 | ||||
| 1679 | for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) { | |||
| 1680 | setOperationAction(ISD::SETCC, VT, Custom); | |||
| 1681 | setOperationAction(ISD::STRICT_FSETCC, VT, Custom); | |||
| 1682 | setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); | |||
| 1683 | setOperationAction(ISD::SELECT, VT, Custom); | |||
| 1684 | setOperationAction(ISD::TRUNCATE, VT, Custom); | |||
| 1685 | ||||
| 1686 | setOperationAction(ISD::BUILD_VECTOR, VT, Custom); | |||
| 1687 | setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); | |||
| 1688 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); | |||
| 1689 | setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); | |||
| 1690 | setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); | |||
| 1691 | setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); | |||
| 1692 | } | |||
| 1693 | ||||
| 1694 | for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 }) | |||
| 1695 | setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); | |||
| 1696 | } | |||
| 1697 | ||||
| 1698 | // This block controls legalization for 512-bit operations with 8/16/32/64-bit | |||
| 1699 | // elements. 512-bit vectors can be disabled based on the prefer-vector-width | |||
| 1700 | // and required-vector-width function attributes. | |||
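| | // [Editorial note] The attributes referred to above are set on the IR | |||
| | // function, e.g.: | |||
| | //   define void @f(<16 x i32> %x) "prefer-vector-width"="256" { ... } | |||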
| 1701 | if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) { | |||
| 1702 | bool HasBWI = Subtarget.hasBWI(); | |||
| 1703 | ||||
| 1704 | addRegisterClass(MVT::v16i32, &X86::VR512RegClass); | |||
| 1705 | addRegisterClass(MVT::v16f32, &X86::VR512RegClass); | |||
| 1706 | addRegisterClass(MVT::v8i64, &X86::VR512RegClass); | |||
| 1707 | addRegisterClass(MVT::v8f64, &X86::VR512RegClass); | |||
| 1708 | addRegisterClass(MVT::v32i16, &X86::VR512RegClass); | |||
| 1709 | addRegisterClass(MVT::v32f16, &X86::VR512RegClass); | |||
| 1710 | addRegisterClass(MVT::v64i8, &X86::VR512RegClass); | |||
| 1711 | ||||
| 1712 | for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { | |||
| 1713 | setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); | |||
| 1714 | setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal); | |||
| 1715 | setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal); | |||
| 1716 | setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); | |||
| 1717 | setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); | |||
| 1718 | if (HasBWI) | |||
| 1719 | setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); | |||
| 1720 | } | |||
| 1721 | ||||
| 1722 | for (MVT VT : { MVT::v16f32, MVT::v8f64 }) { | |||
| 1723 | setOperationAction(ISD::FNEG, VT, Custom); | |||
| 1724 | setOperationAction(ISD::FABS, VT, Custom); | |||
| 1725 | setOperationAction(ISD::FMA, VT, Legal); | |||
| 1726 | setOperationAction(ISD::STRICT_FMA, VT, Legal); | |||
| 1727 | setOperationAction(ISD::FCOPYSIGN, VT, Custom); | |||
| 1728 | } | |||
| 1729 | ||||
| 1730 | for (MVT VT : { MVT::v16i1, MVT::v16i8 }) { | |||
| 1731 | setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32); | |||
| 1732 | setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32); | |||
| 1733 | setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32); | |||
| 1734 | setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32); | |||
| 1735 | } | |||
| 1736 | ||||
| 1737 | for (MVT VT : { MVT::v16i16, MVT::v16i32 }) { | |||
| 1738 | setOperationAction(ISD::FP_TO_SINT, VT, Custom); | |||
| 1739 | setOperationAction(ISD::FP_TO_UINT, VT, Custom); | |||
| 1740 | setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom); | |||
| 1741 | setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom); | |||
| 1742 | } | |||
| 1743 | ||||
| 1744 | setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Custom); | |||
| 1745 | setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Custom); | |||
| 1746 | setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Custom); | |||
| 1747 | setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Custom); | |||
| 1748 | setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Custom); | |||
| 1749 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Custom); | |||
| 1750 | ||||
| 1751 | setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal); | |||
| 1752 | setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal); | |||
| 1753 | setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal); | |||
| 1754 | setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal); | |||
| 1755 | setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal); | |||
| 1756 | setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal); | |||
| 1757 | setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal); | |||
| 1758 | setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal); | |||
| 1759 | setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal); | |||
| 1760 | setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal); | |||
| 1761 | setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal); | |||
| 1762 | ||||
| 1763 | setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); | |||
| 1764 | setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); | |||
| 1765 | setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); | |||
| 1766 | setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); | |||
| 1767 | setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); | |||
| 1768 | if (HasBWI) | |||
| 1769 | setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); | |||
| 1770 | ||||
| 1771 | // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE | |||
| 1772 | // to 512-bit rather than use the AVX2 instructions so that we can use | |||
| 1773 | // k-masks. | |||
| 1774 | if (!Subtarget.hasVLX()) { | |||
| 1775 | for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, | |||
| 1776 | MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { | |||
| 1777 | setOperationAction(ISD::MLOAD, VT, Custom); | |||
| 1778 | setOperationAction(ISD::MSTORE, VT, Custom); | |||
| 1779 | } | |||
| 1780 | } | |||
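| | // [Editorial sketch] i.e. instead of AVX2's VMASKMOV forms, the masked op | |||
| | // is widened so a k-register mask applies, roughly: | |||
| | //   // v8f32 mload -> extract_subvector(v16f32 mload with widened mask, 0) | |||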
| 1781 | ||||
| 1782 | setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal); | |||
| 1783 | setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal); | |||
| 1784 | setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom); | |||
| 1785 | setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); | |||
| 1786 | setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); | |||
| 1787 | setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); | |||
| 1788 | setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); | |||
| 1789 | setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); | |||
| 1790 | setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom); | |||
| 1791 | setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); | |||
| 1792 | setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); | |||
| 1793 | setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); | |||
| 1794 | setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); | |||
| 1795 | ||||
| 1796 | if (HasBWI) { | |||
| 1797 | // Extends from v64i1 masks to 512-bit vectors. | |||
| 1798 | setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); | |||
| 1799 | setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); | |||
| 1800 | setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom); | |||
| 1801 | } | |||
| 1802 | ||||
| 1803 | for (auto VT : { MVT::v16f32, MVT::v8f64 }) { | |||
| 1804 | setOperationAction(ISD::FFLOOR, VT, Legal); | |||
| 1805 | setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); | |||
| 1806 | setOperationAction(ISD::FCEIL, VT, Legal); | |||
| 1807 | setOperationAction(ISD::STRICT_FCEIL, VT, Legal); | |||
| 1808 | setOperationAction(ISD::FTRUNC, VT, Legal); | |||
| 1809 | setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); | |||
| 1810 | setOperationAction(ISD::FRINT, VT, Legal); | |||
| 1811 | setOperationAction(ISD::STRICT_FRINT, VT, Legal); | |||
| 1812 | setOperationAction(ISD::FNEARBYINT, VT, Legal); | |||
| 1813 | setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); | |||
| 1814 | setOperationAction(ISD::FROUNDEVEN, VT, Legal); | |||
| 1815 | setOperationAction(ISD::STRICT_FROUNDEVEN, VT, Legal); | |||
| 1816 | ||||
| 1817 | setOperationAction(ISD::FROUND, VT, Custom); | |||
| 1818 | } | |||
| 1819 | ||||
| 1820 | for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) { | |||
| 1821 | setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom); | |||
| 1822 | setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom); | |||
| 1823 | } | |||
| 1824 | ||||
| 1825 | setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom); | |||
| 1826 | setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom); | |||
| 1827 | setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom); | |||
| 1828 | setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom); | |||
| 1829 | ||||
| 1830 | setOperationAction(ISD::MUL, MVT::v8i64, Custom); | |||
| 1831 | setOperationAction(ISD::MUL, MVT::v16i32, Legal); | |||
| 1832 | setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom); | |||
| 1833 | setOperationAction(ISD::MUL, MVT::v64i8, Custom); | |||
| 1834 | ||||
| 1835 | setOperationAction(ISD::MULHU, MVT::v16i32, Custom); | |||
| 1836 | setOperationAction(ISD::MULHS, MVT::v16i32, Custom); | |||
| 1837 | setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom); | |||
| 1838 | setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom); | |||
| 1839 | setOperationAction(ISD::MULHS, MVT::v64i8, Custom); | |||
| 1840 | setOperationAction(ISD::MULHU, MVT::v64i8, Custom); | |||
| 1841 | setOperationAction(ISD::AVGCEILU, MVT::v32i16, HasBWI ? Legal : Custom); | |||
| 1842 | setOperationAction(ISD::AVGCEILU, MVT::v64i8, HasBWI ? Legal : Custom); | |||
| 1843 | ||||
| 1844 | setOperationAction(ISD::SMULO, MVT::v64i8, Custom); | |||
| 1845 | setOperationAction(ISD::UMULO, MVT::v64i8, Custom); | |||
| 1846 | ||||
| 1847 | setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); | |||
| 1848 | ||||
| 1849 | for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) { | |||
| 1850 | setOperationAction(ISD::SRL, VT, Custom); | |||
| 1851 | setOperationAction(ISD::SHL, VT, Custom); | |||
| 1852 | setOperationAction(ISD::SRA, VT, Custom); | |||
| 1853 | setOperationAction(ISD::ROTL, VT, Custom); | |||
| 1854 | setOperationAction(ISD::ROTR, VT, Custom); | |||
| 1855 | setOperationAction(ISD::SETCC, VT, Custom); | |||
| 1856 | setOperationAction(ISD::ABDS, VT, Custom); | |||
| 1857 | setOperationAction(ISD::ABDU, VT, Custom); | |||
| 1858 | ||||
| 1859 | // The condition codes aren't legal in SSE/AVX and under AVX512 we use | |||
| 1860 | // setcc all the way to isel and prefer SETGT in some isel patterns. | |||
| 1861 | setCondCodeAction(ISD::SETLT, VT, Custom); | |||
| 1862 | setCondCodeAction(ISD::SETLE, VT, Custom); | |||
| 1863 | } | |||
| 1864 | for (auto VT : { MVT::v16i32, MVT::v8i64 }) { | |||
| 1865 | setOperationAction(ISD::SMAX, VT, Legal); | |||
| 1866 | setOperationAction(ISD::UMAX, VT, Legal); | |||
| 1867 | setOperationAction(ISD::SMIN, VT, Legal); | |||
| 1868 | setOperationAction(ISD::UMIN, VT, Legal); | |||
| 1869 | setOperationAction(ISD::ABS, VT, Legal); | |||
| 1870 | setOperationAction(ISD::CTPOP, VT, Custom); | |||
| 1871 | setOperationAction(ISD::STRICT_FSETCC, VT, Custom); | |||
| 1872 | setOperationAction(ISD::STRICT_FSETCCS, VT, Custom); | |||
| 1873 | } | |||
| 1874 | ||||
| 1875 | for (auto VT : { MVT::v64i8, MVT::v32i16 }) { | |||
| 1876 | setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom); | |||
| 1877 | setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom); | |||
| 1878 | setOperationAction(ISD::CTLZ, VT, Custom); | |||
| 1879 | setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom); | |||
| 1880 | setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom); | |||
| 1881 | setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom); | |||
| 1882 | setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom); | |||
| 1883 | setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom); | |||
| 1884 | setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom); | |||
| 1885 | setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom); | |||
| 1886 | setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom); | |||
| 1887 | } | |||
| 1888 | ||||
| 1889 | setOperationAction(ISD::FSHL, MVT::v64i8, Custom); | |||
| 1890 | setOperationAction(ISD::FSHR, MVT::v64i8, Custom); | |||
| 1891 | setOperationAction(ISD::FSHL, MVT::v32i16, Custom); | |||
| 1892 | setOperationAction(ISD::FSHR, MVT::v32i16, Custom); | |||
| 1893 | setOperationAction(ISD::FSHL, MVT::v16i32, Custom); | |||
| 1894 | setOperationAction(ISD::FSHR, MVT::v16i32, Custom); | |||
| 1895 | ||||
| 1896 | if (Subtarget.hasDQI()) { | |||
| 1897 | for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, | |||
| 1898 | ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT, | |||
| 1899 | ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) | |||
| 1900 | setOperationAction(Opc, MVT::v8i64, Custom); | |||
| 1901 | setOperationAction(ISD::MUL, MVT::v8i64, Legal); | |||
| 1902 | } | |||
| 1903 | ||||
| 1904 | if (Subtarget.hasCDI()) { | |||
| 1905 | // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit version. | |||
| 1906 | for (auto VT : { MVT::v16i32, MVT::v8i64} ) { | |||
| 1907 | setOperationAction(ISD::CTLZ, VT, Legal); | |||
| 1908 | } | |||
| 1909 | } // Subtarget.hasCDI() | |||
| 1910 | ||||
| 1911 | if (Subtarget.hasVPOPCNTDQ()) { | |||
| 1912 | for (auto VT : { MVT::v16i32, MVT::v8i64 }) | |||
| 1913 | setOperationAction(ISD::CTPOP, VT, Legal); | |||
| 1914 | } | |||
| 1915 | ||||
| 1916 | // Extract subvector is special because the value type | |||
| 1917 | // (result) is 256-bit but the source is 512-bit wide. | |||
| 1918 | // 128-bit was made Legal under AVX1. | |||
| 1919 | for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, | |||
| 1920 | MVT::v16f16, MVT::v8f32, MVT::v4f64 }) | |||
| 1921 | setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); | |||
| 1922 | ||||
| 1923 | for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64, | |||
| 1924 | MVT::v32f16, MVT::v16f32, MVT::v8f64 }) { | |||
| 1925 | setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); | |||
| 1926 | setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); | |||
| 1927 | setOperationAction(ISD::SELECT, VT, Custom); | |||
| 1928 | setOperationAction(ISD::VSELECT, VT, Custom); | |||
| 1929 | setOperationAction(ISD::BUILD_VECTOR, VT, Custom); | |||
| 1930 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); | |||
| 1931 | setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); | |||
| 1932 | setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); | |||
| 1933 | setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); | |||
| 1934 | } | |||
| 1935 | setF16Action(MVT::v32f16, Expand); | |||
| 1936 | setOperationAction(ISD::FP_ROUND, MVT::v16f16, Custom); | |||
| 1937 | setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Custom); | |||
| 1938 | setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal); | |||
| 1939 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); | |||
| 1940 | for (unsigned Opc : {ISD::FADD, ISD::FSUB, ISD::FMUL, ISD::FDIV}) { | |||
| 1941 | setOperationPromotedToType(Opc, MVT::v16f16, MVT::v16f32); | |||
| 1942 | setOperationPromotedToType(Opc, MVT::v32f16, MVT::v32f32); | |||
| 1943 | } | |||
| 1944 | ||||
| 1945 | for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { | |||
| 1946 | setOperationAction(ISD::MLOAD, VT, Legal); | |||
| 1947 | setOperationAction(ISD::MSTORE, VT, Legal); | |||
| 1948 | setOperationAction(ISD::MGATHER, VT, Custom); | |||
| 1949 | setOperationAction(ISD::MSCATTER, VT, Custom); | |||
| 1950 | } | |||
| 1951 | if (HasBWI) { | |||
| 1952 | for (auto VT : { MVT::v64i8, MVT::v32i16 }) { | |||
| 1953 | setOperationAction(ISD::MLOAD, VT, Legal); | |||
| 1954 | setOperationAction(ISD::MSTORE, VT, Legal); | |||
| 1955 | } | |||
| 1956 | } else { | |||
| 1957 | setOperationAction(ISD::STORE, MVT::v32i16, Custom); | |||
| 1958 | setOperationAction(ISD::STORE, MVT::v64i8, Custom); | |||
| 1959 | } | |||
| 1960 | ||||
| 1961 | if (Subtarget.hasVBMI2()) { | |||
| 1962 | for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64, | |||
| 1963 | MVT::v16i16, MVT::v8i32, MVT::v4i64, | |||
| 1964 | MVT::v32i16, MVT::v16i32, MVT::v8i64 }) { | |||
| 1965 | setOperationAction(ISD::FSHL, VT, Custom); | |||
| 1966 | setOperationAction(ISD::FSHR, VT, Custom); | |||
| 1967 | } | |||
| 1968 | ||||
| 1969 | setOperationAction(ISD::ROTL, MVT::v32i16, Custom); | |||
| 1970 | setOperationAction(ISD::ROTR, MVT::v8i16, Custom); | |||
| 1971 | setOperationAction(ISD::ROTR, MVT::v16i16, Custom); | |||
| 1972 | setOperationAction(ISD::ROTR, MVT::v32i16, Custom); | |||
| 1973 | } | |||
| 1974 | } // useAVX512Regs | |||
| 1975 | ||||
| 1976 | // This block controls legalization for operations that don't have | |||
| 1977 | // pre-AVX512 equivalents. Without VLX we use 512-bit operations for | |||
| 1978 | // narrower widths. | |||
| 1979 | if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { | |||
| 1980 | // These operations are handled on non-VLX by artificially widening in | |||
| 1981 | // isel patterns. | |||
| 1982 | ||||
| 1983 | setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32, Custom); | |||
| 1984 | setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32, Custom); | |||
| 1985 | setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom); | |||
| 1986 | ||||
| 1987 | if (Subtarget.hasDQI()) { | |||
| 1988 | // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. | |||
| 1989 | // v2f32 UINT_TO_FP is already custom under SSE2. | |||
| 1990 | assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && | |||
| 1991 | isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) && | |||
| 1992 | "Unexpected operation action!"); | |||
| 1993 | // v2i64 FP_TO_S/UINT(v2f32) custom conversion. | |||
| 1994 | setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); | |||
| 1995 | setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); | |||
| 1996 | setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom); | |||
| 1997 | setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom); | |||
| 1998 | } | |||
| 1999 | ||||
| 2000 | for (auto VT : { MVT::v2i64, MVT::v4i64 }) { | |||
| 2001 | setOperationAction(ISD::SMAX, VT, Legal); | |||
| 2002 | setOperationAction(ISD::UMAX, VT, Legal); | |||
| 2003 | setOperationAction(ISD::SMIN, VT, Legal); | |||
| 2004 | setOperationAction(ISD::UMIN, VT, Legal); | |||
| 2005 | setOperationAction(ISD::ABS, VT, Legal); | |||
| 2006 | } | |||
| 2007 | ||||
| 2008 | for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { | |||
| 2009 | setOperationAction(ISD::ROTL, VT, Custom); | |||
| 2010 | setOperationAction(ISD::ROTR, VT, Custom); | |||
| 2011 | } | |||
| 2012 | ||||
| 2013 | // Custom legalize 2x32 to get a little better code. | |||
| 2014 | setOperationAction(ISD::MSCATTER, MVT::v2f32, Custom); | |||
| 2015 | setOperationAction(ISD::MSCATTER, MVT::v2i32, Custom); | |||
| 2016 | ||||
| 2017 | for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, | |||
| 2018 | MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) | |||
| 2019 | setOperationAction(ISD::MSCATTER, VT, Custom); | |||
| 2020 | ||||
| 2021 | if (Subtarget.hasDQI()) { | |||
| 2022 | for (auto Opc : {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::STRICT_SINT_TO_FP, | |||
| 2023 | ISD::STRICT_UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT, | |||
| 2024 | ISD::STRICT_FP_TO_SINT, ISD::STRICT_FP_TO_UINT}) { | |||
| 2025 | setOperationAction(Opc, MVT::v2i64, Custom); | |||
| 2026 | setOperationAction(Opc, MVT::v4i64, Custom); | |||
| 2027 | } | |||
| 2028 | setOperationAction(ISD::MUL, MVT::v2i64, Legal); | |||
| 2029 | setOperationAction(ISD::MUL, MVT::v4i64, Legal); | |||
| 2030 | } | |||
| 2031 | ||||
| 2032 | if (Subtarget.hasCDI()) { | |||
| 2033 | for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { | |||
| 2034 | setOperationAction(ISD::CTLZ, VT, Legal); | |||
| 2035 | } | |||
| 2036 | } // Subtarget.hasCDI() | |||
| 2037 | ||||
| 2038 | if (Subtarget.hasVPOPCNTDQ()) { | |||
| 2039 | for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) | |||
| 2040 | setOperationAction(ISD::CTPOP, VT, Legal); | |||
| 2041 | } | |||
| 2042 | } | |||
| 2043 | ||||
| 2044 | // This block controls legalization of v32i1/v64i1, which are available | |||
| 2045 | // with AVX512BW. | |||
| 2046 | if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { | |||
| 2047 | addRegisterClass(MVT::v32i1, &X86::VK32RegClass); | |||
| 2048 | addRegisterClass(MVT::v64i1, &X86::VK64RegClass); | |||
| 2049 | ||||
| 2050 | for (auto VT : { MVT::v32i1, MVT::v64i1 }) { | |||
| 2051 | setOperationAction(ISD::VSELECT, VT, Expand); | |||
| 2052 | setOperationAction(ISD::TRUNCATE, VT, Custom); | |||
| 2053 | setOperationAction(ISD::SETCC, VT, Custom); | |||
| 2054 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); | |||
| 2055 | setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); | |||
| 2056 | setOperationAction(ISD::SELECT, VT, Custom); | |||
| 2057 | setOperationAction(ISD::BUILD_VECTOR, VT, Custom); | |||
| 2058 | setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); | |||
| 2059 | setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); | |||
| 2060 | setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); | |||
| 2061 | } | |||
| 2062 | ||||
| 2063 | for (auto VT : { MVT::v16i1, MVT::v32i1 }) | |||
| 2064 | setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); | |||
| 2065 | ||||
| 2066 | // Extends from v32i1 masks to 256-bit vectors. | |||
| 2067 | setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); | |||
| 2068 | setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); | |||
| 2069 | setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom); | |||
| 2070 | ||||
| 2071 | for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) { | |||
| 2072 | setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); | |||
| 2073 | setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom); | |||
| 2074 | } | |||
| 2075 | ||||
| 2076 | // These operations are handled on non-VLX by artificially widening in | |||
| 2077 | // isel patterns. | |||
| 2078 | // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? | |||
| 2079 | ||||
| 2080 | if (Subtarget.hasBITALG()) { | |||
| 2081 | for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 }) | |||
| 2082 | setOperationAction(ISD::CTPOP, VT, Legal); | |||
| 2083 | } | |||
| 2084 | } | |||
| 2085 | ||||
| 2086 | if (!Subtarget.useSoftFloat() && Subtarget.hasFP16()) { | |||
| 2087 | auto setGroup = [&] (MVT VT) { | |||
| 2088 | setOperationAction(ISD::FADD, VT, Legal); | |||
| 2089 | setOperationAction(ISD::STRICT_FADD, VT, Legal); | |||
| 2090 | setOperationAction(ISD::FSUB, VT, Legal); | |||
| 2091 | setOperationAction(ISD::STRICT_FSUB, VT, Legal); | |||
| 2092 | setOperationAction(ISD::FMUL, VT, Legal); | |||
| 2093 | setOperationAction(ISD::STRICT_FMUL, VT, Legal); | |||
| 2094 | setOperationAction(ISD::FDIV, VT, Legal); | |||
| 2095 | setOperationAction(ISD::STRICT_FDIV, VT, Legal); | |||
| 2096 | setOperationAction(ISD::FSQRT, VT, Legal); | |||
| 2097 | setOperationAction(ISD::STRICT_FSQRT, VT, Legal); | |||
| 2098 | ||||
| 2099 | setOperationAction(ISD::FFLOOR, VT, Legal); | |||
| 2100 | setOperationAction(ISD::STRICT_FFLOOR, VT, Legal); | |||
| 2101 | setOperationAction(ISD::FCEIL, VT, Legal); | |||
| 2102 | setOperationAction(ISD::STRICT_FCEIL, VT, Legal); | |||
| 2103 | setOperationAction(ISD::FTRUNC, VT, Legal); | |||
| 2104 | setOperationAction(ISD::STRICT_FTRUNC, VT, Legal); | |||
| 2105 | setOperationAction(ISD::FRINT, VT, Legal); | |||
| 2106 | setOperationAction(ISD::STRICT_FRINT, VT, Legal); | |||
| 2107 | setOperationAction(ISD::FNEARBYINT, VT, Legal); | |||
| 2108 | setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal); | |||
| 2109 | ||||
| 2110 | setOperationAction(ISD::FROUND, VT, Custom); | |||
| 2111 | ||||
| 2112 | setOperationAction(ISD::LOAD, VT, Legal); | |||
| 2113 | setOperationAction(ISD::STORE, VT, Legal); | |||
| 2114 | ||||
| 2115 | setOperationAction(ISD::FMA, VT, Legal); | |||
| 2116 | setOperationAction(ISD::STRICT_FMA, VT, Legal); | |||
| 2117 | setOperationAction(ISD::VSELECT, VT, Legal); | |||
| 2118 | setOperationAction(ISD::BUILD_VECTOR, VT, Custom); | |||
| 2119 | setOperationAction(ISD::SELECT, VT, Custom); | |||
| 2120 | ||||
| 2121 | setOperationAction(ISD::FNEG, VT, Custom); | |||
| 2122 | setOperationAction(ISD::FABS, VT, Custom); | |||
| 2123 | setOperationAction(ISD::FCOPYSIGN, VT, Custom); | |||
| 2124 | setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); | |||
| 2125 | setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); | |||
| 2126 | }; | |||
| 2127 | ||||
| 2128 | // AVX512_FP16 scalar operations | |||
| 2129 | setGroup(MVT::f16); | |||
| 2130 | setOperationAction(ISD::FREM, MVT::f16, Promote); | |||
| 2131 | setOperationAction(ISD::STRICT_FREM, MVT::f16, Promote); | |||
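| | // (Promote here means an f16 frem is extended to f32, computed there, | |||
| | // ultimately via the fmodf libcall, and truncated back to f16.) | |||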
| 2132 | setOperationAction(ISD::SELECT_CC, MVT::f16, Expand); | |||
| 2133 | setOperationAction(ISD::BR_CC, MVT::f16, Expand); | |||
| 2134 | setOperationAction(ISD::SETCC, MVT::f16, Custom); | |||
| 2135 | setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom); | |||
| 2136 | setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom); | |||
| 2137 | setOperationAction(ISD::STRICT_FROUND, MVT::f16, Promote); | |||
| 2138 | setOperationAction(ISD::FROUNDEVEN, MVT::f16, Legal); | |||
| 2139 | setOperationAction(ISD::STRICT_FROUNDEVEN, MVT::f16, Legal); | |||
| 2140 | setOperationAction(ISD::FP_ROUND, MVT::f16, Custom); | |||
| 2141 | setOperationAction(ISD::STRICT_FP_ROUND, MVT::f16, Custom); | |||
| 2142 | setOperationAction(ISD::FMAXIMUM, MVT::f16, Custom); | |||
| 2143 | setOperationAction(ISD::FMINIMUM, MVT::f16, Custom); | |||
| 2144 | setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal); | |||
| 2145 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); | |||
| 2146 | ||||
| 2147 | setCondCodeAction(ISD::SETOEQ, MVT::f16, Expand); | |||
| 2148 | setCondCodeAction(ISD::SETUNE, MVT::f16, Expand); | |||
| 2149 | ||||
| 2150 | if (Subtarget.useAVX512Regs()) { | |||
| 2151 | setGroup(MVT::v32f16); | |||
| 2152 | setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32f16, Custom); | |||
| 2153 | setOperationAction(ISD::SINT_TO_FP, MVT::v32i16, Legal); | |||
| 2154 | setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v32i16, Legal); | |||
| 2155 | setOperationAction(ISD::UINT_TO_FP, MVT::v32i16, Legal); | |||
| 2156 | setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v32i16, Legal); | |||
| 2157 | setOperationAction(ISD::FP_ROUND, MVT::v16f16, Legal); | |||
| 2158 | setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f16, Legal); | |||
| 2159 | setOperationAction(ISD::FP_EXTEND, MVT::v16f32, Legal); | |||
| 2160 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v16f32, Legal); | |||
| 2161 | setOperationAction(ISD::FP_EXTEND, MVT::v8f64, Legal); | |||
| 2162 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal); | |||
| 2163 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32f16, Custom); | |||
| 2164 | ||||
| 2165 | setOperationAction(ISD::FP_TO_SINT, MVT::v32i16, Custom); | |||
| 2166 | setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v32i16, Custom); | |||
| 2167 | setOperationAction(ISD::FP_TO_UINT, MVT::v32i16, Custom); | |||
| 2168 | setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v32i16, Custom); | |||
| 2169 | setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i8, MVT::v32i16); | |||
| 2170 | setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i8, | |||
| 2171 | MVT::v32i16); | |||
| 2172 | setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i8, MVT::v32i16); | |||
| 2173 | setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i8, | |||
| 2174 | MVT::v32i16); | |||
| 2175 | setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v32i1, MVT::v32i16); | |||
| 2176 | setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v32i1, | |||
| 2177 | MVT::v32i16); | |||
| 2178 | setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v32i1, MVT::v32i16); | |||
| 2179 | setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v32i1, | |||
| 2180 | MVT::v32i16); | |||
| 2181 | ||||
| 2182 | setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f16, Legal); | |||
| 2183 | setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32f16, Legal); | |||
| 2184 | setOperationAction(ISD::CONCAT_VECTORS, MVT::v32f16, Custom); | |||
| 2185 | ||||
| 2186 | setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Legal); | |||
| 2187 | setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Legal); | |||
| 2188 | ||||
| 2189 | setOperationAction(ISD::STRICT_FSETCC, MVT::v32i1, Custom); | |||
| 2190 | setOperationAction(ISD::STRICT_FSETCCS, MVT::v32i1, Custom); | |||
| 2191 | } | |||
| 2192 | ||||
| 2193 | if (Subtarget.hasVLX()) { | |||
| 2194 | setGroup(MVT::v8f16); | |||
| 2195 | setGroup(MVT::v16f16); | |||
| 2196 | ||||
| 2197 | setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8f16, Legal); | |||
| 2198 | setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16f16, Custom); | |||
| 2199 | setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Legal); | |||
| 2200 | setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i16, Legal); | |||
| 2201 | setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Legal); | |||
| 2202 | setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i16, Legal); | |||
| 2203 | setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Legal); | |||
| 2204 | setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i16, Legal); | |||
| 2205 | setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Legal); | |||
| 2206 | setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i16, Legal); | |||
| 2207 | ||||
| 2208 | setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom); | |||
| 2209 | setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i16, Custom); | |||
| 2210 | setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Custom); | |||
| 2211 | setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i16, Custom); | |||
| 2212 | setOperationAction(ISD::FP_ROUND, MVT::v8f16, Legal); | |||
| 2213 | setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f16, Legal); | |||
| 2214 | setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); | |||
| 2215 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f32, Legal); | |||
| 2216 | setOperationAction(ISD::FP_EXTEND, MVT::v4f64, Legal); | |||
| 2217 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal); | |||
| 2218 | ||||
| 2219 | // INSERT_VECTOR_ELT v8f16 extended to VECTOR_SHUFFLE | |||
| 2220 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8f16, Custom); | |||
| 2221 | setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16f16, Custom); | |||
| 2222 | ||||
| 2223 | setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f16, Legal); | |||
| 2224 | setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16f16, Legal); | |||
| 2225 | setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f16, Custom); | |||
| 2226 | ||||
| 2227 | setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Legal); | |||
| 2228 | setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Legal); | |||
| 2229 | setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Legal); | |||
| 2230 | setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Legal); | |||
| 2231 | ||||
| 2232 | // Need to custom widen these to prevent scalarization. | |||
| 2233 | setOperationAction(ISD::LOAD, MVT::v4f16, Custom); | |||
| 2234 | setOperationAction(ISD::STORE, MVT::v4f16, Custom); | |||
| 2235 | } | |||
| 2236 | } | |||
| 2237 | ||||
| 2238 | if (!Subtarget.useSoftFloat() && | |||
| 2239 | (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) { | |||
| 2240 | addRegisterClass(MVT::v8bf16, &X86::VR128XRegClass); | |||
| 2241 | addRegisterClass(MVT::v16bf16, &X86::VR256XRegClass); | |||
| 2242 | // We set the type action of bf16 to TypeSoftPromoteHalf, but there is no | |||
| 2243 | // way to soft-promote a BUILD_VECTOR, so mark the operation action as | |||
| 2244 | // Custom and handle it during lowering instead. | |||
| 2245 | setOperationAction(ISD::BUILD_VECTOR, MVT::bf16, Custom); | |||
| 2246 | for (auto VT : {MVT::v8bf16, MVT::v16bf16}) { | |||
| 2247 | setF16Action(VT, Expand); | |||
| 2248 | setOperationAction(ISD::FADD, VT, Expand); | |||
| 2249 | setOperationAction(ISD::FSUB, VT, Expand); | |||
| 2250 | setOperationAction(ISD::FMUL, VT, Expand); | |||
| 2251 | setOperationAction(ISD::FDIV, VT, Expand); | |||
| 2252 | setOperationAction(ISD::BUILD_VECTOR, VT, Custom); | |||
| 2253 | } | |||
| 2254 | addLegalFPImmediate(APFloat::getZero(APFloat::BFloat())); | |||
| 2255 | } | |||
| 2256 | ||||
| 2257 | if (!Subtarget.useSoftFloat() && Subtarget.hasBF16()) { | |||
| 2258 | addRegisterClass(MVT::v32bf16, &X86::VR512RegClass); | |||
| 2259 | setF16Action(MVT::v32bf16, Expand); | |||
| 2260 | setOperationAction(ISD::FADD, MVT::v32bf16, Expand); | |||
| 2261 | setOperationAction(ISD::FSUB, MVT::v32bf16, Expand); | |||
| 2262 | setOperationAction(ISD::FMUL, MVT::v32bf16, Expand); | |||
| 2263 | setOperationAction(ISD::FDIV, MVT::v32bf16, Expand); | |||
| 2264 | setOperationAction(ISD::BUILD_VECTOR, MVT::v32bf16, Custom); | |||
| 2265 | } | |||
| 2266 | ||||
| 2267 | if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) { | |||
| 2268 | setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); | |||
| 2269 | setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); | |||
| 2270 | setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); | |||
| 2271 | setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); | |||
| 2272 | setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); | |||
| 2273 | ||||
| 2274 | setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); | |||
| 2275 | setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); | |||
| 2276 | setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); | |||
| 2277 | setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); | |||
| 2278 | setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); | |||
| 2279 | ||||
| 2280 | if (Subtarget.hasBWI()) { | |||
| 2281 | setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); | |||
| 2282 | setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); | |||
| 2283 | } | |||
| 2284 | ||||
| 2285 | if (Subtarget.hasFP16()) { | |||
| 2286 | // vcvttph2[u]dq v4f16 -> v4i32/64, v2f16 -> v2i32/64 | |||
| 2287 | setOperationAction(ISD::FP_TO_SINT, MVT::v2f16, Custom); | |||
| 2288 | setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f16, Custom); | |||
| 2289 | setOperationAction(ISD::FP_TO_UINT, MVT::v2f16, Custom); | |||
| 2290 | setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f16, Custom); | |||
| 2291 | setOperationAction(ISD::FP_TO_SINT, MVT::v4f16, Custom); | |||
| 2292 | setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4f16, Custom); | |||
| 2293 | setOperationAction(ISD::FP_TO_UINT, MVT::v4f16, Custom); | |||
| 2294 | setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4f16, Custom); | |||
| 2295 | // vcvt[u]dq2ph v4i32/64 -> v4f16, v2i32/64 -> v2f16 | |||
| 2296 | setOperationAction(ISD::SINT_TO_FP, MVT::v2f16, Custom); | |||
| 2297 | setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f16, Custom); | |||
| 2298 | setOperationAction(ISD::UINT_TO_FP, MVT::v2f16, Custom); | |||
| 2299 | setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f16, Custom); | |||
| 2300 | setOperationAction(ISD::SINT_TO_FP, MVT::v4f16, Custom); | |||
| 2301 | setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4f16, Custom); | |||
| 2302 | setOperationAction(ISD::UINT_TO_FP, MVT::v4f16, Custom); | |||
| 2303 | setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4f16, Custom); | |||
| 2304 | // vcvtps2phx v4f32 -> v4f16, v2f32 -> v2f16 | |||
| 2305 | setOperationAction(ISD::FP_ROUND, MVT::v2f16, Custom); | |||
| 2306 | setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f16, Custom); | |||
| 2307 | setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom); | |||
| 2308 | setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f16, Custom); | |||
| 2309 | // vcvtph2psx v4f16 -> v4f32, v2f16 -> v2f32 | |||
| 2310 | setOperationAction(ISD::FP_EXTEND, MVT::v2f16, Custom); | |||
| 2311 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f16, Custom); | |||
| 2312 | setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Custom); | |||
| 2313 | setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f16, Custom); | |||
| 2314 | } | |||
| 2315 | ||||
| 2316 | setOperationAction(ISD::TRUNCATE, MVT::v16i32, Custom); | |||
| 2317 | setOperationAction(ISD::TRUNCATE, MVT::v8i64, Custom); | |||
| 2318 | setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom); | |||
| 2319 | } | |||
| 2320 | ||||
| 2321 | if (Subtarget.hasAMXTILE()) { | |||
| 2322 | addRegisterClass(MVT::x86amx, &X86::TILERegClass); | |||
| 2323 | } | |||
| 2324 | ||||
| 2325 | // We want to custom lower some of our intrinsics. | |||
| 2326 | setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); | |||
| 2327 | setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); | |||
| 2328 | setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); | |||
| 2329 | if (!Subtarget.is64Bit()) { | |||
| 2330 | setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); | |||
| 2331 | } | |||
| 2332 | ||||
| 2333 | // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't | |||
| 2334 | // handle type legalization for these operations here. | |||
| 2335 | // | |||
| 2336 | // FIXME: We really should do custom legalization for addition and | |||
| 2337 | // subtraction on x86-32 once PR3203 is fixed. We really can't do much better | |||
| 2338 | // than generic legalization for 64-bit multiplication-with-overflow, though. | |||
| 2339 | for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) { | |||
| 2340 | if (VT == MVT::i64 && !Subtarget.is64Bit()) | |||
| 2341 | continue; | |||
| 2342 | // Add/Sub/Mul with overflow operations are custom lowered. | |||
| 2343 | setOperationAction(ISD::SADDO, VT, Custom); | |||
| 2344 | setOperationAction(ISD::UADDO, VT, Custom); | |||
| 2345 | setOperationAction(ISD::SSUBO, VT, Custom); | |||
| 2346 | setOperationAction(ISD::USUBO, VT, Custom); | |||
| 2347 | setOperationAction(ISD::SMULO, VT, Custom); | |||
| 2348 | setOperationAction(ISD::UMULO, VT, Custom); | |||
| 2349 | ||||
| 2350 | // Support carry in as value rather than glue. | |||
| 2351 | setOperationAction(ISD::UADDO_CARRY, VT, Custom); | |||
| 2352 | setOperationAction(ISD::USUBO_CARRY, VT, Custom); | |||
| 2353 | setOperationAction(ISD::SETCCCARRY, VT, Custom); | |||
| 2354 | setOperationAction(ISD::SADDO_CARRY, VT, Custom); | |||
| 2355 | setOperationAction(ISD::SSUBO_CARRY, VT, Custom); | |||
| 2356 | } | |||
| 2357 | ||||
| 2358 | if (!Subtarget.is64Bit()) { | |||
| 2359 | // These libcalls are not available in 32-bit. | |||
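| | // (They all operate on 128-bit integers, which the 32-bit runtime does | |||
| | // not provide.) | |||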
| 2360 | setLibcallName(RTLIB::SHL_I128, nullptr); | |||
| 2361 | setLibcallName(RTLIB::SRL_I128, nullptr); | |||
| 2362 | setLibcallName(RTLIB::SRA_I128, nullptr); | |||
| 2363 | setLibcallName(RTLIB::MUL_I128, nullptr); | |||
| 2364 | // The MULO libcall is not part of libgcc, only compiler-rt. | |||
| 2365 | setLibcallName(RTLIB::MULO_I64, nullptr); | |||
| 2366 | } | |||
| 2367 | // The MULO libcall is not part of libgcc, only compiler-rt. | |||
| 2368 | setLibcallName(RTLIB::MULO_I128, nullptr); | |||
| 2369 | ||||
| 2370 | // Combine sin / cos into _sincos_stret if it is available. | |||
| 2371 | if (getLibcallName(RTLIB::SINCOS_STRET_F32) != nullptr && | |||
| 2372 | getLibcallName(RTLIB::SINCOS_STRET_F64) != nullptr) { | |||
| 2373 | setOperationAction(ISD::FSINCOS, MVT::f64, Custom); | |||
| 2374 | setOperationAction(ISD::FSINCOS, MVT::f32, Custom); | |||
| 2375 | } | |||
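| | // (_sincos_stret computes sin(x) and cos(x) in one call and returns both, | |||
| | // so a matching sin/cos pair costs a single libcall instead of two.) | |||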
| 2376 | ||||
| 2377 | if (Subtarget.isTargetWin64()) { | |||
| 2378 | setOperationAction(ISD::SDIV, MVT::i128, Custom); | |||
| 2379 | setOperationAction(ISD::UDIV, MVT::i128, Custom); | |||
| 2380 | setOperationAction(ISD::SREM, MVT::i128, Custom); | |||
| 2381 | setOperationAction(ISD::UREM, MVT::i128, Custom); | |||
| 2382 | setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom); | |||
| 2383 | setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom); | |||
| 2384 | setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom); | |||
| 2385 | setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom); | |||
| 2386 | setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom); | |||
| 2387 | setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom); | |||
| 2388 | setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom); | |||
| 2389 | setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom); | |||
| 2390 | } | |||
| 2391 | ||||
| 2392 | // On 32-bit MSVC, `fmodf(f32)` is not defined; only `fmod(f64)` is. | |||
| 2393 | // We should promote the value to 64 bits to solve this. | |||
| 2394 | // This is what the CRT headers do - `fmodf` is an inline header | |||
| 2395 | // function casting to f64 and calling `fmod`. | |||
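| | // Roughly, the CRT's inline definition is: | |||
| | //   static inline float fmodf(float x, float y) { | |||
| | //     return (float)fmod((double)x, (double)y); | |||
| | //   } | |||
| | // so promoting these f32 ops to f64 matches what user code gets anyway. | |||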
| 2396 | if (Subtarget.is32Bit() && | |||
| 2397 | (Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium())) | |||
| 2398 | for (ISD::NodeType Op : | |||
| 2399 | {ISD::FCEIL, ISD::STRICT_FCEIL, | |||
| 2400 | ISD::FCOS, ISD::STRICT_FCOS, | |||
| 2401 | ISD::FEXP, ISD::STRICT_FEXP, | |||
| 2402 | ISD::FFLOOR, ISD::STRICT_FFLOOR, | |||
| 2403 | ISD::FREM, ISD::STRICT_FREM, | |||
| 2404 | ISD::FLOG, ISD::STRICT_FLOG, | |||
| 2405 | ISD::FLOG10, ISD::STRICT_FLOG10, | |||
| 2406 | ISD::FPOW, ISD::STRICT_FPOW, | |||
| 2407 | ISD::FSIN, ISD::STRICT_FSIN}) | |||
| 2408 | if (isOperationExpand(Op, MVT::f32)) | |||
| 2409 | setOperationAction(Op, MVT::f32, Promote); | |||
| 2410 | ||||
| 2411 | // We have target-specific dag combine patterns for the following nodes: | |||
| 2412 | setTargetDAGCombine({ISD::VECTOR_SHUFFLE, | |||
| 2413 | ISD::SCALAR_TO_VECTOR, | |||
| 2414 | ISD::INSERT_VECTOR_ELT, | |||
| 2415 | ISD::EXTRACT_VECTOR_ELT, | |||
| 2416 | ISD::CONCAT_VECTORS, | |||
| 2417 | ISD::INSERT_SUBVECTOR, | |||
| 2418 | ISD::EXTRACT_SUBVECTOR, | |||
| 2419 | ISD::BITCAST, | |||
| 2420 | ISD::VSELECT, | |||
| 2421 | ISD::SELECT, | |||
| 2422 | ISD::SHL, | |||
| 2423 | ISD::SRA, | |||
| 2424 | ISD::SRL, | |||
| 2425 | ISD::OR, | |||
| 2426 | ISD::AND, | |||
| 2427 | ISD::ADD, | |||
| 2428 | ISD::FADD, | |||
| 2429 | ISD::FSUB, | |||
| 2430 | ISD::FNEG, | |||
| 2431 | ISD::FMA, | |||
| 2432 | ISD::STRICT_FMA, | |||
| 2433 | ISD::FMINNUM, | |||
| 2434 | ISD::FMAXNUM, | |||
| 2435 | ISD::SUB, | |||
| 2436 | ISD::LOAD, | |||
| 2437 | ISD::MLOAD, | |||
| 2438 | ISD::STORE, | |||
| 2439 | ISD::MSTORE, | |||
| 2440 | ISD::TRUNCATE, | |||
| 2441 | ISD::ZERO_EXTEND, | |||
| 2442 | ISD::ANY_EXTEND, | |||
| 2443 | ISD::SIGN_EXTEND, | |||
| 2444 | ISD::SIGN_EXTEND_INREG, | |||
| 2445 | ISD::ANY_EXTEND_VECTOR_INREG, | |||
| 2446 | ISD::SIGN_EXTEND_VECTOR_INREG, | |||
| 2447 | ISD::ZERO_EXTEND_VECTOR_INREG, | |||
| 2448 | ISD::SINT_TO_FP, | |||
| 2449 | ISD::UINT_TO_FP, | |||
| 2450 | ISD::STRICT_SINT_TO_FP, | |||
| 2451 | ISD::STRICT_UINT_TO_FP, | |||
| 2452 | ISD::SETCC, | |||
| 2453 | ISD::MUL, | |||
| 2454 | ISD::XOR, | |||
| 2455 | ISD::MSCATTER, | |||
| 2456 | ISD::MGATHER, | |||
| 2457 | ISD::FP16_TO_FP, | |||
| 2458 | ISD::FP_EXTEND, | |||
| 2459 | ISD::STRICT_FP_EXTEND, | |||
| 2460 | ISD::FP_ROUND, | |||
| 2461 | ISD::STRICT_FP_ROUND}); | |||
| 2462 | ||||
| 2463 | computeRegisterProperties(Subtarget.getRegisterInfo()); | |||
| 2464 | ||||
| 2465 | MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores | |||
| 2466 | MaxStoresPerMemsetOptSize = 8; | |||
| 2467 | MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores | |||
| 2468 | MaxStoresPerMemcpyOptSize = 4; | |||
| 2469 | MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores | |||
| 2470 | MaxStoresPerMemmoveOptSize = 4; | |||
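| | // For example, with 16-byte vector stores this still allows a 256-byte | |||
| | // @llvm.memset (16 stores x 16 bytes) to be expanded inline. | |||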
| 2471 | ||||
| 2472 | // TODO: These control memcmp expansion in CGP and could be raised higher, but | |||
| 2473 | // that needs to be benchmarked and balanced with the potential use of vector | |||
| 2474 | // load/store types (PR33329, PR33914). | |||
| 2475 | MaxLoadsPerMemcmp = 2; | |||
| 2476 | MaxLoadsPerMemcmpOptSize = 2; | |||
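| | // For example, a 16-byte memcmp can still be expanded into two 8-byte | |||
| | // loads per buffer instead of a libcall. | |||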
| 2477 | ||||
| 2478 | // Default loop alignment, which can be overridden by -align-loops. | |||
| 2479 | setPrefLoopAlignment(Align(16)); | |||
| 2480 | ||||
| 2481 | // An out-of-order CPU can speculatively execute past a predictable branch, | |||
| 2482 | // but a conditional move could be stalled by an expensive earlier operation. | |||
| 2483 | PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder(); | |||
| 2484 | EnableExtLdPromotion = true; | |||
| 2485 | setPrefFunctionAlignment(Align(16)); | |||
| 2486 | ||||
| 2487 | verifyIntrinsicTables(); | |||
| 2488 | ||||
| 2489 | // Default to having -disable-strictnode-mutation on | |||
| 2490 | IsStrictFPEnabled = true; | |||
| 2491 | } | |||
| 2492 | ||||
| 2493 | // This has so far only been implemented for 64-bit MachO. | |||
| 2494 | bool X86TargetLowering::useLoadStackGuardNode() const { | |||
| 2495 | return Subtarget.isTargetMachO() && Subtarget.is64Bit(); | |||
| 2496 | } | |||
| 2497 | ||||
| 2498 | bool X86TargetLowering::useStackGuardXorFP() const { | |||
| 2499 | // Currently only MSVC CRTs XOR the frame pointer into the stack guard value. | |||
| 2500 | return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO(); | |||
| 2501 | } | |||
| 2502 | ||||
| 2503 | SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, | |||
| 2504 | const SDLoc &DL) const { | |||
| 2505 | EVT PtrTy = getPointerTy(DAG.getDataLayout()); | |||
| 2506 | unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP; | |||
| 2507 | MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val); | |||
| 2508 | return SDValue(Node, 0); | |||
| 2509 | } | |||
| 2510 | ||||
| 2511 | TargetLoweringBase::LegalizeTypeAction | |||
| 2512 | X86TargetLowering::getPreferredVectorAction(MVT VT) const { | |||
| 2513 | if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() && | |||
| 2514 | !Subtarget.hasBWI()) | |||
| 2515 | return TypeSplitVector; | |||
| 2516 | ||||
| 2517 | if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && | |||
| 2518 | !Subtarget.hasF16C() && VT.getVectorElementType() == MVT::f16) | |||
| 2519 | return TypeSplitVector; | |||
| 2520 | ||||
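| | // Everything else is widened to a wider legal type, e.g. v2i32 becomes | |||
| | // v4i32. | |||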
| 2521 | if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 && | |||
| 2522 | VT.getVectorElementType() != MVT::i1) | |||
| 2523 | return TypeWidenVector; | |||
| 2524 | ||||
| 2525 | return TargetLoweringBase::getPreferredVectorAction(VT); | |||
| 2526 | } | |||
| 2527 | ||||
| 2528 | static std::pair<MVT, unsigned> | |||
| 2529 | handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC, | |||
| 2530 | const X86Subtarget &Subtarget) { | |||
| 2531 | // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling | |||
| 2532 | // convention is one that uses k registers. | |||
| 2533 | if (NumElts == 2) | |||
| 2534 | return {MVT::v2i64, 1}; | |||
| 2535 | if (NumElts == 4) | |||
| 2536 | return {MVT::v4i32, 1}; | |||
| 2537 | if (NumElts == 8 && CC != CallingConv::X86_RegCall && | |||
| 2538 | CC != CallingConv::Intel_OCL_BI) | |||
| 2539 | return {MVT::v8i16, 1}; | |||
| 2540 | if (NumElts == 16 && CC != CallingConv::X86_RegCall && | |||
| 2541 | CC != CallingConv::Intel_OCL_BI) | |||
| 2542 | return {MVT::v16i8, 1}; | |||
| 2543 | // v32i1 passes in ymm unless we have BWI and the calling convention is | |||
| 2544 | // regcall. | |||
| 2545 | if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall)) | |||
| 2546 | return {MVT::v32i8, 1}; | |||
| 2547 | // Split v64i1 vectors if we don't have v64i8 available. | |||
| 2548 | if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) { | |||
| 2549 | if (Subtarget.useAVX512Regs()) | |||
| 2550 | return {MVT::v64i8, 1}; | |||
| 2551 | return {MVT::v32i8, 2}; | |||
| 2552 | } | |||
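| | // e.g. with BWI but a 256-bit register preference, a v64i1 argument is | |||
| | // passed as two v32i8 values. | |||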
| 2553 | ||||
| 2554 | // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. | |||
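| | // (e.g. a v7i1 argument becomes seven i8 values) | |||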
| 2555 | if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) || | |||
| 2556 | NumElts > 64) | |||
| 2557 | return {MVT::i8, NumElts}; | |||
| 2558 | ||||
| 2559 | return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0}; | |||
| 2560 | } | |||
| 2561 | ||||
| 2562 | MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context, | |||
| 2563 | CallingConv::ID CC, | |||
| 2564 | EVT VT) const { | |||
| 2565 | if (VT.isVector()) { | |||
| 2566 | if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) { | |||
| 2567 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 2568 | ||||
| 2569 | MVT RegisterVT; | |||
| 2570 | unsigned NumRegisters; | |||
| 2571 | std::tie(RegisterVT, NumRegisters) = | |||
| 2572 | handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); | |||
| 2573 | if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) | |||
| 2574 | return RegisterVT; | |||
| 2575 | } | |||
| 2576 | ||||
| 2577 | if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8) | |||
| 2578 | return MVT::v8f16; | |||
| 2579 | } | |||
| 2580 | ||||
| 2581 | // We will use more GPRs for f64 and f80 on 32-bit targets when x87 is disabled. | |||
| 2582 | if ((VT == MVT::f64 || VT == MVT::f80) && !Subtarget.is64Bit() && | |||
| 2583 | !Subtarget.hasX87()) | |||
| 2584 | return MVT::i32; | |||
| 2585 | ||||
| 2586 | if (VT.isVector() && VT.getVectorElementType() == MVT::bf16) | |||
| 2587 | return getRegisterTypeForCallingConv(Context, CC, | |||
| 2588 | VT.changeVectorElementTypeToInteger()); | |||
| 2589 | ||||
| 2590 | return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT); | |||
| 2591 | } | |||
| 2592 | ||||
| 2593 | unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context, | |||
| 2594 | CallingConv::ID CC, | |||
| 2595 | EVT VT) const { | |||
| 2596 | if (VT.isVector()) { | |||
| 2597 | if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) { | |||
| 2598 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 2599 | ||||
| 2600 | MVT RegisterVT; | |||
| 2601 | unsigned NumRegisters; | |||
| 2602 | std::tie(RegisterVT, NumRegisters) = | |||
| 2603 | handleMaskRegisterForCallingConv(NumElts, CC, Subtarget); | |||
| 2604 | if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE) | |||
| 2605 | return NumRegisters; | |||
| 2606 | } | |||
| 2607 | ||||
| 2608 | if (VT.getVectorElementType() == MVT::f16 && VT.getVectorNumElements() < 8) | |||
| 2609 | return 1; | |||
| 2610 | } | |||
| 2611 | ||||
| 2612 | // We have to split f64 into 2 registers and f80 into 3 registers on 32-bit | |||
| 2613 | // targets if x87 is disabled. | |||
| 2614 | if (!Subtarget.is64Bit() && !Subtarget.hasX87()) { | |||
| 2615 | if (VT == MVT::f64) | |||
| 2616 | return 2; | |||
| 2617 | if (VT == MVT::f80) | |||
| 2618 | return 3; | |||
| 2619 | } | |||
| 2620 | ||||
| 2621 | if (VT.isVector() && VT.getVectorElementType() == MVT::bf16) | |||
| 2622 | return getNumRegistersForCallingConv(Context, CC, | |||
| 2623 | VT.changeVectorElementTypeToInteger()); | |||
| 2624 | ||||
| 2625 | return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT); | |||
| 2626 | } | |||
| 2627 | ||||
| 2628 | unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv( | |||
| 2629 | LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, | |||
| 2630 | unsigned &NumIntermediates, MVT &RegisterVT) const { | |||
| 2631 | // Break wide or odd vXi1 vectors into scalars to match avx2 behavior. | |||
| 2632 | if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && | |||
| 2633 | Subtarget.hasAVX512() && | |||
| 2634 | (!isPowerOf2_32(VT.getVectorNumElements()) || | |||
| 2635 | (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) || | |||
| 2636 | VT.getVectorNumElements() > 64)) { | |||
| 2637 | RegisterVT = MVT::i8; | |||
| 2638 | IntermediateVT = MVT::i1; | |||
| 2639 | NumIntermediates = VT.getVectorNumElements(); | |||
| 2640 | return NumIntermediates; | |||
| 2641 | } | |||
| 2642 | ||||
| 2643 | // Split v64i1 vectors if we don't have v64i8 available. | |||
| 2644 | if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && | |||
| 2645 | CC != CallingConv::X86_RegCall) { | |||
| 2646 | RegisterVT = MVT::v32i8; | |||
| 2647 | IntermediateVT = MVT::v32i1; | |||
| 2648 | NumIntermediates = 2; | |||
| 2649 | return 2; | |||
| 2650 | } | |||
| 2651 | ||||
| 2652 | return TargetLowering::getVectorTypeBreakdownForCallingConv( | |||
| 2653 |     Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT); | |||
| 2654 | } | |||
| 2655 | ||||
| 2656 | EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, | |||
| 2657 | LLVMContext& Context, | |||
| 2658 | EVT VT) const { | |||
| 2659 | if (!VT.isVector()) | |||
| 2660 | return MVT::i8; | |||
| 2661 | ||||
| 2662 | if (Subtarget.hasAVX512()) { | |||
| 2663 | // Figure out what this type will be legalized to. | |||
| 2664 | EVT LegalVT = VT; | |||
| 2665 | while (getTypeAction(Context, LegalVT) != TypeLegal) | |||
| 2666 | LegalVT = getTypeToTransformTo(Context, LegalVT); | |||
| 2667 | ||||
| 2668 | // If we got a 512-bit vector then we'll definitely have a vXi1 compare. | |||
| 2669 | if (LegalVT.getSimpleVT().is512BitVector()) | |||
| 2670 | return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount()); | |||
| 2671 | ||||
| 2672 | if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) { | |||
| 2673 | // If we legalized to less than a 512-bit vector, then we will use a vXi1 | |||
| 2674 | // compare for vXi32/vXi64 for sure. If we have BWI we will also support | |||
| 2675 | // vXi16/vXi8. | |||
| 2676 | MVT EltVT = LegalVT.getSimpleVT().getVectorElementType(); | |||
| 2677 | if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32) | |||
| 2678 | return EVT::getVectorVT(Context, MVT::i1, VT.getVectorElementCount()); | |||
| 2679 | } | |||
| 2680 | } | |||
| 2681 | ||||
| 2682 | return VT.changeVectorElementTypeToInteger(); | |||
| 2683 | } | |||
| 2684 | ||||
| 2685 | /// Helper for getByValTypeAlignment to determine | |||
| 2686 | /// the desired ByVal argument alignment. | |||
| 2687 | static void getMaxByValAlign(Type *Ty, Align &MaxAlign) { | |||
| 2688 | if (MaxAlign == 16) | |||
| 2689 | return; | |||
| 2690 | if (VectorType *VTy = dyn_cast<VectorType>(Ty)) { | |||
| 2691 | if (VTy->getPrimitiveSizeInBits().getFixedValue() == 128) | |||
| 2692 | MaxAlign = Align(16); | |||
| 2693 | } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) { | |||
| 2694 | Align EltAlign; | |||
| 2695 | getMaxByValAlign(ATy->getElementType(), EltAlign); | |||
| 2696 | if (EltAlign > MaxAlign) | |||
| 2697 | MaxAlign = EltAlign; | |||
| 2698 | } else if (StructType *STy = dyn_cast<StructType>(Ty)) { | |||
| 2699 | for (auto *EltTy : STy->elements()) { | |||
| 2700 | Align EltAlign; | |||
| 2701 | getMaxByValAlign(EltTy, EltAlign); | |||
| 2702 | if (EltAlign > MaxAlign) | |||
| 2703 | MaxAlign = EltAlign; | |||
| 2704 | if (MaxAlign == 16) | |||
| 2705 | break; | |||
| 2706 | } | |||
| 2707 | } | |||
| 2708 | } | |||
| 2709 | ||||
| 2710 | /// Return the desired alignment for ByVal aggregate | |||
| 2711 | /// function arguments in the caller parameter area. For X86, aggregates | |||
| 2712 | /// that contain SSE vectors are placed at 16-byte boundaries while the rest | |||
| 2713 | /// are at 4-byte boundaries. | |||
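| | /// For example, on 32-bit with SSE enabled, a struct containing a __m128 | |||
| | /// field is placed at a 16-byte boundary, while a struct of scalars stays | |||
| | /// at 4 bytes. | |||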
| 2714 | uint64_t X86TargetLowering::getByValTypeAlignment(Type *Ty, | |||
| 2715 | const DataLayout &DL) const { | |||
| 2716 | if (Subtarget.is64Bit()) { | |||
| 2717 | // Max of 8 and alignment of type. | |||
| 2718 | Align TyAlign = DL.getABITypeAlign(Ty); | |||
| 2719 | if (TyAlign > 8) | |||
| 2720 | return TyAlign.value(); | |||
| 2721 | return 8; | |||
| 2722 | } | |||
| 2723 | ||||
| 2724 | Align Alignment(4); | |||
| 2725 | if (Subtarget.hasSSE1()) | |||
| 2726 | getMaxByValAlign(Ty, Alignment); | |||
| 2727 | return Alignment.value(); | |||
| 2728 | } | |||
| 2729 | ||||
| 2730 | /// Returns the optimal memory op type for this target, or EVT::Other if | |||
| 2731 | /// the type should be determined using generic target-independent logic. | |||
| 2732 | /// For vector ops we check that the overall size isn't larger than our | |||
| 2733 | /// preferred vector width. | |||
| 2734 | EVT X86TargetLowering::getOptimalMemOpType( | |||
| 2735 | const MemOp &Op, const AttributeList &FuncAttributes) const { | |||
| 2736 | if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) { | |||
| 2737 | if (Op.size() >= 16 && | |||
| 2738 | (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) { | |||
| 2739 | // FIXME: Check if unaligned 64-byte accesses are slow. | |||
| 2740 | if (Op.size() >= 64 && Subtarget.hasAVX512() && | |||
| 2741 | (Subtarget.getPreferVectorWidth() >= 512)) { | |||
| 2742 | return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32; | |||
| 2743 | } | |||
| 2744 | // FIXME: Check if unaligned 32-byte accesses are slow. | |||
| 2745 | if (Op.size() >= 32 && Subtarget.hasAVX() && | |||
| 2746 | Subtarget.useLight256BitInstructions()) { | |||
| 2747 | // Although this isn't a well-supported type for AVX1, we'll let | |||
| 2748 | // legalization and shuffle lowering produce the optimal codegen. If we | |||
| 2749 | // choose an optimal type with a vector element larger than a byte, | |||
| 2750 | // getMemsetStores() may create an intermediate splat (using an integer | |||
| 2751 | // multiply) before we splat as a vector. | |||
| 2752 | return MVT::v32i8; | |||
| 2753 | } | |||
| 2754 | if (Subtarget.hasSSE2() && (Subtarget.getPreferVectorWidth() >= 128)) | |||
| 2755 | return MVT::v16i8; | |||
| 2756 | // TODO: Can SSE1 handle a byte vector? | |||
| 2757 | // If we have SSE1 registers we should be able to use them. | |||
| 2758 | if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) && | |||
| 2759 | (Subtarget.getPreferVectorWidth() >= 128)) | |||
| 2760 | return MVT::v4f32; | |||
| 2761 | } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) && | |||
| 2762 | Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) { | |||
| 2763 | // Do not use f64 to lower memcpy if source is string constant. It's | |||
| 2764 | // better to use i32 to avoid the loads. | |||
| 2765 | // Also, do not use f64 to lower memset unless this is a memset of zeros. | |||
| 2766 | // The gymnastics of splatting a byte value into an XMM register and then | |||
| 2767 | // only using 8-byte stores (because this is a CPU with slow unaligned | |||
| 2768 | // 16-byte accesses) makes that a loser. | |||
| 2769 | return MVT::f64; | |||
| 2770 | } | |||
| 2771 | } | |||
| 2772 | // This is a compromise. If we reach here, unaligned accesses may be slow on | |||
| 2773 | // this target. However, creating smaller, aligned accesses could be even | |||
| 2774 | // slower and would certainly be a lot more code. | |||
| 2775 | if (Subtarget.is64Bit() && Op.size() >= 8) | |||
| 2776 | return MVT::i64; | |||
| 2777 | return MVT::i32; | |||
| 2778 | } | |||
| 2779 | ||||
| 2780 | bool X86TargetLowering::isSafeMemOpType(MVT VT) const { | |||
| 2781 | if (VT == MVT::f32) | |||
| 2782 | return Subtarget.hasSSE1(); | |||
| 2783 | if (VT == MVT::f64) | |||
| 2784 | return Subtarget.hasSSE2(); | |||
| 2785 | return true; | |||
| 2786 | } | |||
| 2787 | ||||
| 2788 | static bool isBitAligned(Align Alignment, uint64_t SizeInBits) { | |||
| 2789 | return (8 * Alignment.value()) % SizeInBits == 0; | |||
| 2790 | } | |||
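| | // e.g. Align(16) is 128 bits, which evenly covers 32-, 64- and 128-bit | |||
| | // accesses. | |||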
| 2791 | ||||
| 2792 | bool X86TargetLowering::isMemoryAccessFast(EVT VT, Align Alignment) const { | |||
| 2793 | if (isBitAligned(Alignment, VT.getSizeInBits())) | |||
| 2794 | return true; | |||
| 2795 | switch (VT.getSizeInBits()) { | |||
| 2796 | default: | |||
| 2797 | // 8-byte and under are always assumed to be fast. | |||
| 2798 | return true; | |||
| 2799 | case 128: | |||
| 2800 | return !Subtarget.isUnalignedMem16Slow(); | |||
| 2801 | case 256: | |||
| 2802 | return !Subtarget.isUnalignedMem32Slow(); | |||
| 2803 | // TODO: What about AVX-512 (512-bit) accesses? | |||
| 2804 | } | |||
| 2805 | } | |||
| 2806 | ||||
| 2807 | bool X86TargetLowering::allowsMisalignedMemoryAccesses( | |||
| 2808 | EVT VT, unsigned, Align Alignment, MachineMemOperand::Flags Flags, | |||
| 2809 | unsigned *Fast) const { | |||
| 2810 | if (Fast) | |||
| 2811 | *Fast = isMemoryAccessFast(VT, Alignment); | |||
| 2812 | // NonTemporal vector memory ops must be aligned. | |||
| 2813 | if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) { | |||
| 2814 | // NT loads can only be vector aligned, so if it's less aligned than the | |||
| 2815 | // minimum vector size (which we can split the vector down to), we might as | |||
| 2816 | // well use a regular unaligned vector load. | |||
| 2817 | // We don't have any NT loads pre-SSE41. | |||
| 2818 | if (!!(Flags & MachineMemOperand::MOLoad)) | |||
| 2819 | return (Alignment < 16 || !Subtarget.hasSSE41()); | |||
| 2820 | return false; | |||
| 2821 | } | |||
| 2822 | // Misaligned accesses of any size are always allowed. | |||
| 2823 | return true; | |||
| 2824 | } | |||
| 2825 | ||||
| 2826 | bool X86TargetLowering::allowsMemoryAccess(LLVMContext &Context, | |||
| 2827 | const DataLayout &DL, EVT VT, | |||
| 2828 | unsigned AddrSpace, Align Alignment, | |||
| 2829 | MachineMemOperand::Flags Flags, | |||
| 2830 | unsigned *Fast) const { | |||
| 2831 | if (Fast) | |||
| 2832 | *Fast = isMemoryAccessFast(VT, Alignment); | |||
| 2833 | if (!!(Flags & MachineMemOperand::MONonTemporal) && VT.isVector()) { | |||
| 2834 | if (allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags, | |||
| 2835 | /*Fast=*/nullptr)) | |||
| 2836 | return true; | |||
| 2837 | // NonTemporal vector memory ops are special, and must be aligned. | |||
| 2838 | if (!isBitAligned(Alignment, VT.getSizeInBits())) | |||
| 2839 | return false; | |||
| 2840 | switch (VT.getSizeInBits()) { | |||
| 2841 | case 128: | |||
| 2842 | if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasSSE41()) | |||
| 2843 | return true; | |||
| 2844 | if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasSSE2()) | |||
| 2845 | return true; | |||
| 2846 | return false; | |||
| 2847 | case 256: | |||
| 2848 | if (!!(Flags & MachineMemOperand::MOLoad) && Subtarget.hasAVX2()) | |||
| 2849 | return true; | |||
| 2850 | if (!!(Flags & MachineMemOperand::MOStore) && Subtarget.hasAVX()) | |||
| 2851 | return true; | |||
| 2852 | return false; | |||
| 2853 | case 512: | |||
| 2854 | if (Subtarget.hasAVX512()) | |||
| 2855 | return true; | |||
| 2856 | return false; | |||
| 2857 | default: | |||
| 2858 | return false; // Don't have NonTemporal vector memory ops of this size. | |||
| 2859 | } | |||
| 2860 | } | |||
| 2861 | return true; | |||
| 2862 | } | |||
| 2863 | ||||
| 2864 | /// Return the entry encoding for a jump table in the | |||
| 2865 | /// current function. The returned value is a member of the | |||
| 2866 | /// MachineJumpTableInfo::JTEntryKind enum. | |||
| 2867 | unsigned X86TargetLowering::getJumpTableEncoding() const { | |||
| 2868 | // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF | |||
| 2869 | // symbol. | |||
| 2870 | if (isPositionIndependent() && Subtarget.isPICStyleGOT()) | |||
| 2871 | return MachineJumpTableInfo::EK_Custom32; | |||
| 2872 | ||||
| 2873 | // Otherwise, use the normal jump table encoding heuristics. | |||
| 2874 | return TargetLowering::getJumpTableEncoding(); | |||
| 2875 | } | |||
| 2876 | ||||
| 2877 | bool X86TargetLowering::splitValueIntoRegisterParts( | |||
| 2878 | SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, | |||
| 2879 | unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const { | |||
| 2880 | bool IsABIRegCopy = CC.has_value(); | |||
| 2881 | EVT ValueVT = Val.getValueType(); | |||
| 2882 | if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) { | |||
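| | // bf16 is passed in the low 16 bits of an f32 register: bitcast the | |||
| | // bf16 to i16, any-extend to i32, then bitcast to f32. | |||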
| 2883 | unsigned ValueBits = ValueVT.getSizeInBits(); | |||
| 2884 | unsigned PartBits = PartVT.getSizeInBits(); | |||
| 2885 | Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val); | |||
| 2886 | Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val); | |||
| 2887 | Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val); | |||
| 2888 | Parts[0] = Val; | |||
| 2889 | return true; | |||
| 2890 | } | |||
| 2891 | return false; | |||
| 2892 | } | |||
| 2893 | ||||
| 2894 | SDValue X86TargetLowering::joinRegisterPartsIntoValue( | |||
| 2895 | SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts, | |||
| 2896 | MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const { | |||
| 2897 | bool IsABIRegCopy = CC.has_value(); | |||
| 2898 | if (IsABIRegCopy && ValueVT == MVT::bf16 && PartVT == MVT::f32) { | |||
| 2899 | unsigned ValueBits = ValueVT.getSizeInBits(); | |||
| 2900 | unsigned PartBits = PartVT.getSizeInBits(); | |||
| 2901 | SDValue Val = Parts[0]; | |||
| 2902 | ||||
| 2903 | Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val); | |||
| 2904 | Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val); | |||
| 2905 | Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val); | |||
| 2906 | return Val; | |||
| 2907 | } | |||
| 2908 | return SDValue(); | |||
| 2909 | } | |||
| 2910 | ||||
| 2911 | bool X86TargetLowering::useSoftFloat() const { | |||
| 2912 | return Subtarget.useSoftFloat(); | |||
| 2913 | } | |||
| 2914 | ||||
| 2915 | void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC, | |||
| 2916 | ArgListTy &Args) const { | |||
| 2917 | ||||
| 2918 | // Only relabel X86-32 for C / Stdcall CCs. | |||
| 2919 | if (Subtarget.is64Bit()) | |||
| 2920 | return; | |||
| 2921 | if (CC != CallingConv::C && CC != CallingConv::X86_StdCall) | |||
| 2922 | return; | |||
| 2923 | unsigned ParamRegs = 0; | |||
| 2924 | if (auto *M = MF->getFunction().getParent()) | |||
| 2925 | ParamRegs = M->getNumberRegisterParameters(); | |||
| 2926 | ||||
| 2927 | // Mark the first N integer arguments as being passed in registers. | |||
| 2928 | for (auto &Arg : Args) { | |||
| 2929 | Type *T = Arg.Ty; | |||
| 2930 | if (T->isIntOrPtrTy()) | |||
| 2931 | if (MF->getDataLayout().getTypeAllocSize(T) <= 8) { | |||
| 2932 | unsigned numRegs = 1; | |||
| 2933 | if (MF->getDataLayout().getTypeAllocSize(T) > 4) | |||
| 2934 | numRegs = 2; | |||
| 2935 | if (ParamRegs < numRegs) | |||
| 2936 | return; | |||
| 2937 | ParamRegs -= numRegs; | |||
| 2938 | Arg.IsInReg = true; | |||
| 2939 | } | |||
| 2940 | } | |||
| 2941 | } | |||
| 2942 | ||||
| 2943 | const MCExpr * | |||
| 2944 | X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, | |||
| 2945 | const MachineBasicBlock *MBB, | |||
| 2946 | unsigned uid, MCContext &Ctx) const { | |||
| 2947 | assert(isPositionIndependent() && Subtarget.isPICStyleGOT()); | |||
| 2948 | // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF | |||
| 2949 | // entries. | |||
| 2950 | return MCSymbolRefExpr::create(MBB->getSymbol(), | |||
| 2951 | MCSymbolRefExpr::VK_GOTOFF, Ctx); | |||
| 2952 | } | |||
| 2953 | ||||
| 2954 | /// Returns relocation base for the given PIC jumptable. | |||
| 2955 | SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, | |||
| 2956 | SelectionDAG &DAG) const { | |||
| 2957 | if (!Subtarget.is64Bit()) | |||
| 2958 | // This doesn't have SDLoc associated with it, but is not really the | |||
| 2959 | // same as a Register. | |||
| 2960 | return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), | |||
| 2961 | getPointerTy(DAG.getDataLayout())); | |||
| 2962 | return Table; | |||
| 2963 | } | |||
| 2964 | ||||
| 2965 | /// This returns the relocation base for the given PIC jumptable, | |||
| 2966 | /// the same as getPICJumpTableRelocBase, but as an MCExpr. | |||
| 2967 | const MCExpr *X86TargetLowering:: | |||
| 2968 | getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, | |||
| 2969 | MCContext &Ctx) const { | |||
| 2970 | // X86-64 uses RIP relative addressing based on the jump table label. | |||
| 2971 | if (Subtarget.isPICStyleRIPRel()) | |||
| 2972 | return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); | |||
| 2973 | ||||
| 2974 | // Otherwise, the reference is relative to the PIC base. | |||
| 2975 | return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx); | |||
| 2976 | } | |||
| 2977 | ||||
| 2978 | std::pair<const TargetRegisterClass *, uint8_t> | |||
| 2979 | X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI, | |||
| 2980 | MVT VT) const { | |||
| 2981 | const TargetRegisterClass *RRC = nullptr; | |||
| 2982 | uint8_t Cost = 1; | |||
| 2983 | switch (VT.SimpleTy) { | |||
| 2984 | default: | |||
| 2985 | return TargetLowering::findRepresentativeClass(TRI, VT); | |||
| 2986 | case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: | |||
| 2987 | RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass; | |||
| 2988 | break; | |||
| 2989 | case MVT::x86mmx: | |||
| 2990 | RRC = &X86::VR64RegClass; | |||
| 2991 | break; | |||
| 2992 | case MVT::f32: case MVT::f64: | |||
| 2993 | case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: | |||
| 2994 | case MVT::v4f32: case MVT::v2f64: | |||
| 2995 | case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64: | |||
| 2996 | case MVT::v8f32: case MVT::v4f64: | |||
| 2997 | case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64: | |||
| 2998 | case MVT::v16f32: case MVT::v8f64: | |||
| 2999 | RRC = &X86::VR128XRegClass; | |||
| 3000 | break; | |||
| 3001 | } | |||
| 3002 | return std::make_pair(RRC, Cost); | |||
| 3003 | } | |||
| 3004 | ||||
| 3005 | unsigned X86TargetLowering::getAddressSpace() const { | |||
| 3006 | if (Subtarget.is64Bit()) | |||
| 3007 | return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257; | |||
| 3008 | return 256; | |||
| 3009 | } | |||
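| | // (Address space 256 selects %gs and 257 selects %fs; see the X86AS enum.) | |||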
| 3010 | ||||
| 3011 | static bool hasStackGuardSlotTLS(const Triple &TargetTriple) { | |||
| 3012 | return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() || | |||
| 3013 | (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17)); | |||
| 3014 | } | |||
| 3015 | ||||
| 3016 | static Constant* SegmentOffset(IRBuilderBase &IRB, | |||
| 3017 | int Offset, unsigned AddressSpace) { | |||
| 3018 | return ConstantExpr::getIntToPtr( | |||
| 3019 | ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset), | |||
| 3020 | Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace)); | |||
| 3021 | } | |||
| 3022 | ||||
| 3023 | Value *X86TargetLowering::getIRStackGuard(IRBuilderBase &IRB) const { | |||
| 3024 | // glibc, bionic, and Fuchsia have a special slot for the stack guard in | |||
| 3025 | // tcbhead_t; use it instead of the usual global variable (see | |||
| 3026 | // sysdeps/{i386,x86_64}/nptl/tls.h) | |||
| 3027 | if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) { | |||
| 3028 | if (Subtarget.isTargetFuchsia()) { | |||
| 3029 | // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value. | |||
| 3030 | return SegmentOffset(IRB, 0x10, getAddressSpace()); | |||
| 3031 | } else { | |||
| 3032 | unsigned AddressSpace = getAddressSpace(); | |||
| 3033 | Module *M = IRB.GetInsertBlock()->getParent()->getParent(); | |||
| 3034 | // Note that users may customize the base register and offset. | |||
| 3035 | int Offset = M->getStackProtectorGuardOffset(); | |||
| 3036 | // If -stack-protector-guard-offset was not set, the guard lives at | |||
| 3037 | // %fs:0x28, unless we're using a Kernel code model, in which case it's | |||
| 3038 | // %gs:0x28. On i386 it's %gs:0x14. | |||
| 3039 | if (Offset == INT_MAX) | |||
| 3040 | Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14; | |||
| 3041 | ||||
| 3042 | StringRef GuardReg = M->getStackProtectorGuardReg(); | |||
| 3043 | if (GuardReg == "fs") | |||
| 3044 | AddressSpace = X86AS::FS; | |||
| 3045 | else if (GuardReg == "gs") | |||
| 3046 | AddressSpace = X86AS::GS; | |||
| 3047 | ||||
| 3048 | // Use the guard symbol if the user specified one. | |||
| 3049 | StringRef GuardSymb = M->getStackProtectorGuardSymbol(); | |||
| 3050 | if (!GuardSymb.empty()) { | |||
| 3051 | GlobalVariable *GV = M->getGlobalVariable(GuardSymb); | |||
| 3052 | if (!GV) { | |||
| 3053 | Type *Ty = Subtarget.is64Bit() ? Type::getInt64Ty(M->getContext()) | |||
| 3054 | : Type::getInt32Ty(M->getContext()); | |||
| 3055 | GV = new GlobalVariable(*M, Ty, false, GlobalValue::ExternalLinkage, | |||
| 3056 | nullptr, GuardSymb, nullptr, | |||
| 3057 | GlobalValue::NotThreadLocal, AddressSpace); | |||
| 3058 | } | |||
| 3059 | return GV; | |||
| 3060 | } | |||
| 3061 | ||||
| 3062 | return SegmentOffset(IRB, Offset, AddressSpace); | |||
| 3063 | } | |||
| 3064 | } | |||
| 3065 | return TargetLowering::getIRStackGuard(IRB); | |||
| 3066 | } | |||
| 3067 | ||||
| 3068 | void X86TargetLowering::insertSSPDeclarations(Module &M) const { | |||
| 3069 | // MSVC CRT provides functionality for stack protection. | |||
| 3070 | if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || | |||
| 3071 | Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { | |||
| 3072 | // MSVC CRT has a global variable holding security cookie. | |||
| 3073 | M.getOrInsertGlobal("__security_cookie", | |||
| 3074 | Type::getInt8PtrTy(M.getContext())); | |||
| 3075 | ||||
| 3076 | // MSVC CRT has a function to validate security cookie. | |||
| 3077 | FunctionCallee SecurityCheckCookie = M.getOrInsertFunction( | |||
| 3078 | "__security_check_cookie", Type::getVoidTy(M.getContext()), | |||
| 3079 | Type::getInt8PtrTy(M.getContext())); | |||
| 3080 | if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee())) { | |||
| 3081 | F->setCallingConv(CallingConv::X86_FastCall); | |||
| 3082 | F->addParamAttr(0, Attribute::AttrKind::InReg); | |||
| 3083 | } | |||
| 3084 | return; | |||
| 3085 | } | |||
| 3086 | ||||
| 3087 | StringRef GuardMode = M.getStackProtectorGuard(); | |||
| 3088 | ||||
| 3089 | // glibc, bionic, and Fuchsia have a special slot for the stack guard. | |||
| 3090 | if ((GuardMode == "tls" || GuardMode.empty()) && | |||
| 3091 | hasStackGuardSlotTLS(Subtarget.getTargetTriple())) | |||
| 3092 | return; | |||
| 3093 | TargetLowering::insertSSPDeclarations(M); | |||
| 3094 | } | |||
| 3095 | ||||
| 3096 | Value *X86TargetLowering::getSDagStackGuard(const Module &M) const { | |||
| 3097 | // MSVC CRT has a global variable holding security cookie. | |||
| 3098 | if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || | |||
| 3099 | Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { | |||
| 3100 | return M.getGlobalVariable("__security_cookie"); | |||
| 3101 | } | |||
| 3102 | return TargetLowering::getSDagStackGuard(M); | |||
| 3103 | } | |||
| 3104 | ||||
| 3105 | Function *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const { | |||
| 3106 | // MSVC CRT has a function to validate security cookie. | |||
| 3107 | if (Subtarget.getTargetTriple().isWindowsMSVCEnvironment() || | |||
| 3108 | Subtarget.getTargetTriple().isWindowsItaniumEnvironment()) { | |||
| 3109 | return M.getFunction("__security_check_cookie"); | |||
| 3110 | } | |||
| 3111 | return TargetLowering::getSSPStackGuardCheck(M); | |||
| 3112 | } | |||
| 3113 | ||||
| 3114 | Value * | |||
| 3115 | X86TargetLowering::getSafeStackPointerLocation(IRBuilderBase &IRB) const { | |||
| 3116 | if (Subtarget.getTargetTriple().isOSContiki()) | |||
| 3117 | return getDefaultSafeStackPointerLocation(IRB, false); | |||
| 3118 | ||||
| 3119 | // Android provides a fixed TLS slot for the SafeStack pointer. See the | |||
| 3120 | // definition of TLS_SLOT_SAFESTACK in | |||
| 3121 | // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h | |||
| 3122 | if (Subtarget.isTargetAndroid()) { | |||
| 3123 | // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs; | |||
| 3124 | // %gs:0x24 on i386. | |||
| 3125 | int Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24; | |||
| 3126 | return SegmentOffset(IRB, Offset, getAddressSpace()); | |||
| 3127 | } | |||
| 3128 | ||||
| 3129 | // Fuchsia is similar. | |||
| 3130 | if (Subtarget.isTargetFuchsia()) { | |||
| 3131 | // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value. | |||
| 3132 | return SegmentOffset(IRB, 0x18, getAddressSpace()); | |||
| 3133 | } | |||
| 3134 | ||||
| 3135 | return TargetLowering::getSafeStackPointerLocation(IRB); | |||
| 3136 | } | |||
| 3137 | ||||
| 3138 | //===----------------------------------------------------------------------===// | |||
| 3139 | // Return Value Calling Convention Implementation | |||
| 3140 | //===----------------------------------------------------------------------===// | |||
| 3141 | ||||
| 3142 | bool X86TargetLowering::CanLowerReturn( | |||
| 3143 | CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, | |||
| 3144 | const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const { | |||
| 3145 | SmallVector<CCValAssign, 16> RVLocs; | |||
| 3146 | CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context); | |||
| 3147 | return CCInfo.CheckReturn(Outs, RetCC_X86); | |||
| 3148 | } | |||
| 3149 | ||||
| 3150 | const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { | |||
| 3151 | static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; | |||
| 3152 | return ScratchRegs; | |||
| 3153 | } | |||
| 3154 | ||||
| 3155 | ArrayRef<MCPhysReg> X86TargetLowering::getRoundingControlRegisters() const { | |||
| 3156 | // FIXME: We should def X86::FPCW for x87 as well. But it affects a lot of lit | |||
| 3157 | // tests at the moment, which is not what we expected. | |||
| 3158 | static const MCPhysReg RCRegs[] = {X86::MXCSR}; | |||
| 3159 | return RCRegs; | |||
| 3160 | } | |||
| 3161 | ||||
| 3162 | /// Lowers mask values (v*i1) to the local register values. | |||
| 3163 | /// \returns the DAG node after lowering to the register type. | |||
| 3164 | static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc, | |||
| 3165 | const SDLoc &Dl, SelectionDAG &DAG) { | |||
| 3166 | EVT ValVT = ValArg.getValueType(); | |||
| 3167 | ||||
| 3168 | if (ValVT == MVT::v1i1) | |||
| 3169 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, Dl, ValLoc, ValArg, | |||
| 3170 | DAG.getIntPtrConstant(0, Dl)); | |||
| 3171 | ||||
| 3172 | if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) || | |||
| 3173 | (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) { | |||
| 3174 | // Two-stage lowering might be required: | |||
| 3175 | // bitcast: v8i1 -> i8 / v16i1 -> i16 | |||
| 3176 | // anyextend: i8 -> i32 / i16 -> i32 | |||
| 3177 | EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16; | |||
| 3178 | SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg); | |||
| 3179 | if (ValLoc == MVT::i32) | |||
| 3180 | ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy); | |||
| 3181 | return ValToCopy; | |||
| 3182 | } | |||
| 3183 | ||||
| 3184 | if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) || | |||
| 3185 | (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) { | |||
| 3186 | // One-stage lowering is required: | |||
| 3187 | // bitcast: v32i1 -> i32 / v64i1 -> i64 | |||
| 3188 | return DAG.getBitcast(ValLoc, ValArg); | |||
| 3189 | } | |||
| 3190 | ||||
| 3191 | return DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValArg); | |||
| 3192 | } | |||
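| | ||||
| | // For illustration, a minimal sketch of the two-stage path above, assuming a | |||
| | // SelectionDAG `DAG`, an SDLoc `Dl`, and a v8i1 value `Mask` are in scope: | |||
| | // | |||
| | //   SDValue AsI8  = DAG.getBitcast(MVT::i8, Mask);               // v8i1 -> i8 | |||
| | //   SDValue AsI32 = DAG.getNode(ISD::ANY_EXTEND, Dl, MVT::i32, AsI8); | |||
| | // | |||
| | // The v32i1/v64i1 cases need only the single bitcast stage, since the mask | |||
| | // width already matches the destination register width. | |||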
| 3193 | ||||
| 3194 | /// Breaks a v64i1 value into two registers and adds the new nodes to the DAG. | |||
| 3195 | static void Passv64i1ArgInRegs( | |||
| 3196 | const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg, | |||
| 3197 | SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA, | |||
| 3198 | CCValAssign &NextVA, const X86Subtarget &Subtarget) { | |||
| 3199 | assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); | |||
| 3200 | assert(Subtarget.is32Bit() && "Expecting 32 bit target"); | |||
| 3201 | assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value"); | |||
| 3202 | assert(VA.isRegLoc() && NextVA.isRegLoc() && | |||
| 3203 | "The value should reside in two registers"); | |||
| 3204 | ||||
| 3205 | // Before splitting the value we cast it to i64 | |||
| 3206 | Arg = DAG.getBitcast(MVT::i64, Arg); | |||
| 3207 | ||||
| 3208 | // Split the value into two i32 halves. | |||
| 3209 | SDValue Lo, Hi; | |||
| 3210 | std::tie(Lo, Hi) = DAG.SplitScalar(Arg, Dl, MVT::i32, MVT::i32); | |||
| 3211 | ||||
| 3212 | // Attach the two i32 halves to the corresponding registers. | |||
| 3213 | RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo)); | |||
| 3214 | RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi)); | |||
| 3215 | } | |||
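| | ||||
| | // Conceptually, DAG.SplitScalar above expands to the following sketch | |||
| | // (assuming `DAG`, `Dl`, and the i64 `Arg` are in scope): | |||
| | // | |||
| | //   SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg, | |||
| | //                            DAG.getIntPtrConstant(0, Dl)); // bits 0..31 | |||
| | //   SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg, | |||
| | //                            DAG.getIntPtrConstant(1, Dl)); // bits 32..63 | |||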
| 3216 | ||||
| 3217 | SDValue | |||
| 3218 | X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, | |||
| 3219 | bool isVarArg, | |||
| 3220 | const SmallVectorImpl<ISD::OutputArg> &Outs, | |||
| 3221 | const SmallVectorImpl<SDValue> &OutVals, | |||
| 3222 | const SDLoc &dl, SelectionDAG &DAG) const { | |||
| 3223 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 3224 | X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); | |||
| 3225 | ||||
| 3226 | // In some cases we need to disable registers from the default CSR list. | |||
| 3227 | // For example, when they are used as return registers (preserve_* and X86's | |||
| 3228 | // regcall) or for argument passing (X86's regcall). | |||
| 3229 | bool ShouldDisableCalleeSavedRegister = | |||
| 3230 | shouldDisableRetRegFromCSR(CallConv) || | |||
| 3231 | MF.getFunction().hasFnAttribute("no_caller_saved_registers"); | |||
| 3232 | ||||
| 3233 | if (CallConv == CallingConv::X86_INTR && !Outs.empty()) | |||
| 3234 | report_fatal_error("X86 interrupts may not return any value"); | |||
| 3235 | ||||
| 3236 | SmallVector<CCValAssign, 16> RVLocs; | |||
| 3237 | CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext()); | |||
| 3238 | CCInfo.AnalyzeReturn(Outs, RetCC_X86); | |||
| 3239 | ||||
| 3240 | SmallVector<std::pair<Register, SDValue>, 4> RetVals; | |||
| 3241 | for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E; | |||
| 3242 | ++I, ++OutsIndex) { | |||
| 3243 | CCValAssign &VA = RVLocs[I]; | |||
| 3244 | assert(VA.isRegLoc() && "Can only return in registers!"); | |||
| 3245 | ||||
| 3246 | // Add the register to the CalleeSaveDisableRegs list. | |||
| 3247 | if (ShouldDisableCalleeSavedRegister) | |||
| 3248 | MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg()); | |||
| 3249 | ||||
| 3250 | SDValue ValToCopy = OutVals[OutsIndex]; | |||
| 3251 | EVT ValVT = ValToCopy.getValueType(); | |||
| 3252 | ||||
| 3253 | // Promote values to the appropriate types. | |||
| 3254 | if (VA.getLocInfo() == CCValAssign::SExt) | |||
| 3255 | ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); | |||
| 3256 | else if (VA.getLocInfo() == CCValAssign::ZExt) | |||
| 3257 | ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); | |||
| 3258 | else if (VA.getLocInfo() == CCValAssign::AExt) { | |||
| 3259 | if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1) | |||
| 3260 | ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG); | |||
| 3261 | else | |||
| 3262 | ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); | |||
| 3263 | } | |||
| 3264 | else if (VA.getLocInfo() == CCValAssign::BCvt) | |||
| 3265 | ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy); | |||
| 3266 | ||||
| 3267 | assert(VA.getLocInfo() != CCValAssign::FPExt && | |||
| 3268 | "Unexpected FP-extend for return value."); | |||
| 3269 | ||||
| 3270 | // Report an error if we have attempted to return a value via an XMM | |||
| 3271 | // register and SSE was disabled. | |||
| 3272 | if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) { | |||
| 3273 | errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); | |||
| 3274 | VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. | |||
| 3275 | } else if (!Subtarget.hasSSE2() && | |||
| 3276 | X86::FR64XRegClass.contains(VA.getLocReg()) && | |||
| 3277 | ValVT == MVT::f64) { | |||
| 3278 | // When returning a double via an XMM register, report an error if SSE2 is | |||
| 3279 | // not enabled. | |||
| 3280 | errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); | |||
| 3281 | VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. | |||
| 3282 | } | |||
| 3283 | ||||
| 3284 | // Returns in ST0/ST1 are handled specially: these are pushed as operands to | |||
| 3285 | // the RET instruction and handled by the FP Stackifier. | |||
| 3286 | if (VA.getLocReg() == X86::FP0 || | |||
| 3287 | VA.getLocReg() == X86::FP1) { | |||
| 3288 | // If this is a copy from an xmm register to ST(0), use an FPExtend to | |||
| 3289 | // change the value to the FP stack register class. | |||
| 3290 | if (isScalarFPTypeInSSEReg(VA.getValVT())) | |||
| 3291 | ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); | |||
| 3292 | RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); | |||
| 3293 | // Don't emit a copytoreg. | |||
| 3294 | continue; | |||
| 3295 | } | |||
| 3296 | ||||
| 3297 | // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 | |||
| 3298 | // which is returned in RAX / RDX. | |||
| 3299 | if (Subtarget.is64Bit()) { | |||
| 3300 | if (ValVT == MVT::x86mmx) { | |||
| 3301 | if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { | |||
| 3302 | ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy); | |||
| 3303 | ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, | |||
| 3304 | ValToCopy); | |||
| 3305 | // If we don't have SSE2 available, convert to v4f32 so the generated | |||
| 3306 | // register is legal. | |||
| 3307 | if (!Subtarget.hasSSE2()) | |||
| 3308 | ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy); | |||
| 3309 | } | |||
| 3310 | } | |||
| 3311 | } | |||
| 3312 | ||||
| 3313 | if (VA.needsCustom()) { | |||
| 3314 | assert(VA.getValVT() == MVT::v64i1 && | |||
| 3315 | "Currently the only custom case is when we split v64i1 to 2 regs"); | |||
| 3316 | ||||
| 3317 | Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I], | |||
| 3318 | Subtarget); | |||
| 3319 | ||||
| 3320 | // Add the second register to the CalleeSaveDisableRegs list. | |||
| 3321 | if (ShouldDisableCalleeSavedRegister) | |||
| 3322 | MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg()); | |||
| 3323 | } else { | |||
| 3324 | RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy)); | |||
| 3325 | } | |||
| 3326 | } | |||
| 3327 | ||||
| 3328 | SDValue Glue; | |||
| 3329 | SmallVector<SDValue, 6> RetOps; | |||
| 3330 | RetOps.push_back(Chain); // Operand #0 = Chain (updated below) | |||
| 3331 | // Operand #1 = Bytes To Pop | |||
| 3332 | RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl, | |||
| 3333 | MVT::i32)); | |||
| 3334 | ||||
| 3335 | // Copy the result values into the output registers. | |||
| 3336 | for (auto &RetVal : RetVals) { | |||
| 3337 | if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) { | |||
| 3338 | RetOps.push_back(RetVal.second); | |||
| 3339 | continue; // Don't emit a copytoreg. | |||
| 3340 | } | |||
| 3341 | ||||
| 3342 | Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Glue); | |||
| 3343 | Glue = Chain.getValue(1); | |||
| 3344 | RetOps.push_back( | |||
| 3345 | DAG.getRegister(RetVal.first, RetVal.second.getValueType())); | |||
| 3346 | } | |||
| 3347 | ||||
| 3348 | // The Swift calling convention does not require that we copy the sret | |||
| 3349 | // argument into %rax/%eax for the return, and SRetReturnReg is not set for Swift. | |||
| 3350 | ||||
| 3351 | // All x86 ABIs require that for returning structs by value we copy | |||
| 3352 | // the sret argument into %rax/%eax (depending on ABI) for the return. | |||
| 3353 | // We saved the argument into a virtual register in the entry block, | |||
| 3354 | // so now we copy the value out and into %rax/%eax. | |||
| 3355 | // | |||
| 3356 | // Checking Function.hasStructRetAttr() here is insufficient because the IR | |||
| 3357 | // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is | |||
| 3358 | // false, then an sret argument may be implicitly inserted in the SelDAG. In | |||
| 3359 | // either case FuncInfo->setSRetReturnReg() will have been called. | |||
| 3360 | if (Register SRetReg = FuncInfo->getSRetReturnReg()) { | |||
| 3361 | // When we have both sret and another return value, we should use the | |||
| 3362 | // original Chain stored in RetOps[0], instead of the current Chain updated | |||
| 3363 | // in the above loop. If we only have sret, RetOps[0] equals Chain. | |||
| 3364 | ||||
| 3365 | // For the case of sret and another return value, we have | |||
| 3366 | // Chain_0 at the function entry | |||
| 3367 | // Chain_1 = getCopyToReg(Chain_0) in the above loop | |||
| 3368 | // If we use Chain_1 in getCopyFromReg, we will have | |||
| 3369 | // Val = getCopyFromReg(Chain_1) | |||
| 3370 | // Chain_2 = getCopyToReg(Chain_1, Val) from below | |||
| 3371 | ||||
| 3372 | // getCopyToReg(Chain_0) will be glued together with | |||
| 3373 | // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be | |||
| 3374 | // in Unit B, and we will have cyclic dependency between Unit A and Unit B: | |||
| 3375 | // Data dependency from Unit B to Unit A due to usage of Val in | |||
| 3376 | // getCopyToReg(Chain_1, Val) | |||
| 3377 | // Chain dependency from Unit A to Unit B | |||
| 3378 | ||||
| 3379 | // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg. | |||
| 3380 | SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg, | |||
| 3381 | getPointerTy(MF.getDataLayout())); | |||
| 3382 | ||||
| 3383 | Register RetValReg | |||
| 3384 | = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ? | |||
| 3385 | X86::RAX : X86::EAX; | |||
| 3386 | Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Glue); | |||
| 3387 | Glue = Chain.getValue(1); | |||
| 3388 | ||||
| 3389 | // RAX/EAX now acts like a return value. | |||
| 3390 | RetOps.push_back( | |||
| 3391 | DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout()))); | |||
| 3392 | ||||
| 3393 | // Add the returned register to the CalleeSaveDisableRegs list. Don't do | |||
| 3394 | // this however for preserve_most/preserve_all to minimize the number of | |||
| 3395 | // callee-saved registers for these CCs. | |||
| 3396 | if (ShouldDisableCalleeSavedRegister && | |||
| 3397 | CallConv != CallingConv::PreserveAll && | |||
| 3398 | CallConv != CallingConv::PreserveMost) | |||
| 3399 | MF.getRegInfo().disableCalleeSavedRegister(RetValReg); | |||
| 3400 | } | |||
| 3401 | ||||
| 3402 | const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); | |||
| 3403 | const MCPhysReg *I = | |||
| 3404 | TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction()); | |||
| 3405 | if (I) { | |||
| 3406 | for (; *I; ++I) { | |||
| 3407 | if (X86::GR64RegClass.contains(*I)) | |||
| 3408 | RetOps.push_back(DAG.getRegister(*I, MVT::i64)); | |||
| 3409 | else | |||
| 3410 | llvm_unreachable("Unexpected register class in CSRsViaCopy!")::llvm::llvm_unreachable_internal("Unexpected register class in CSRsViaCopy!" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 3410); | |||
| 3411 | } | |||
| 3412 | } | |||
| 3413 | ||||
| 3414 | RetOps[0] = Chain; // Update chain. | |||
| 3415 | ||||
| 3416 | // Add the glue if we have it. | |||
| 3417 | if (Glue.getNode()) | |||
| 3418 | RetOps.push_back(Glue); | |||
| 3419 | ||||
| 3420 | X86ISD::NodeType opcode = X86ISD::RET_GLUE; | |||
| 3421 | if (CallConv == CallingConv::X86_INTR) | |||
| 3422 | opcode = X86ISD::IRET; | |||
| 3423 | return DAG.getNode(opcode, dl, MVT::Other, RetOps); | |||
| 3424 | } | |||
| 3425 | ||||
| 3426 | bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { | |||
| 3427 | if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0)) | |||
| 3428 | return false; | |||
| 3429 | ||||
| 3430 | SDValue TCChain = Chain; | |||
| 3431 | SDNode *Copy = *N->use_begin(); | |||
| 3432 | if (Copy->getOpcode() == ISD::CopyToReg) { | |||
| 3433 | // If the copy has a glue operand, we conservatively assume it isn't safe to | |||
| 3434 | // perform a tail call. | |||
| 3435 | if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) | |||
| 3436 | return false; | |||
| 3437 | TCChain = Copy->getOperand(0); | |||
| 3438 | } else if (Copy->getOpcode() != ISD::FP_EXTEND) | |||
| 3439 | return false; | |||
| 3440 | ||||
| 3441 | bool HasRet = false; | |||
| 3442 | for (const SDNode *U : Copy->uses()) { | |||
| 3443 | if (U->getOpcode() != X86ISD::RET_GLUE) | |||
| 3444 | return false; | |||
| 3445 | // If we are returning more than one value, we can definitely | |||
| 3446 | // not make a tail call; see PR19530. | |||
| 3447 | if (U->getNumOperands() > 4) | |||
| 3448 | return false; | |||
| 3449 | if (U->getNumOperands() == 4 && | |||
| 3450 | U->getOperand(U->getNumOperands() - 1).getValueType() != MVT::Glue) | |||
| 3451 | return false; | |||
| 3452 | HasRet = true; | |||
| 3453 | } | |||
| 3454 | ||||
| 3455 | if (!HasRet) | |||
| 3456 | return false; | |||
| 3457 | ||||
| 3458 | Chain = TCChain; | |||
| 3459 | return true; | |||
| 3460 | } | |||
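| | ||||
| | // A source-level pattern this recognizes, as a sketch with hypothetical | |||
| | // names: | |||
| | // | |||
| | //   double wrapper(double X) { return callee(X); } | |||
| | // | |||
| | // Here the call result feeds only the CopyToReg chained into the lone | |||
| | // RET_GLUE node, so Chain can be rewired past the copy and the call emitted | |||
| | // as a tail call. | |||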
| 3461 | ||||
| 3462 | EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT, | |||
| 3463 | ISD::NodeType ExtendKind) const { | |||
| 3464 | MVT ReturnMVT = MVT::i32; | |||
| 3465 | ||||
| 3466 | bool Darwin = Subtarget.getTargetTriple().isOSDarwin(); | |||
| 3467 | if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) { | |||
| 3468 | // The ABI does not require i1, i8 or i16 to be extended. | |||
| 3469 | // | |||
| 3470 | // On Darwin, there is code in the wild relying on Clang's old behaviour of | |||
| 3471 | // always extending i8/i16 return values, so keep doing that for now. | |||
| 3472 | // (PR26665). | |||
| 3473 | ReturnMVT = MVT::i8; | |||
| 3474 | } | |||
| 3475 | ||||
| 3476 | EVT MinVT = getRegisterType(Context, ReturnMVT); | |||
| 3477 | return VT.bitsLT(MinVT) ? MinVT : VT; | |||
| 3478 | } | |||
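| | ||||
| | // Worked example for VT = i8: on non-Darwin targets ReturnMVT is i8, so | |||
| | // MinVT == i8 and the value is returned unextended in %al; on Darwin, | |||
| | // ReturnMVT stays i32, VT.bitsLT(MinVT) holds, and the return value is | |||
| | // extended to i32 first. | |||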
| 3479 | ||||
| 3480 | /// Reads two 32 bit registers and creates a 64 bit mask value. | |||
| 3481 | /// \param VA The current 32 bit value that needs to be assigned. | |||
| 3482 | /// \param NextVA The next 32 bit value that needs to be assigned. | |||
| 3483 | /// \param Root The parent DAG node. | |||
| 3484 | /// \param [in,out] InGlue Represents the SDValue in the parent DAG node used | |||
| 3485 | /// for glue purposes. If the DAG is already using a | |||
| 3486 | /// physical register instead of a virtual one, we | |||
| 3487 | /// should glue our new SDValue to the InGlue SDValue. | |||
| 3488 | /// \returns a new 64 bit SDValue. | |||
| 3489 | static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA, | |||
| 3490 | SDValue &Root, SelectionDAG &DAG, | |||
| 3491 | const SDLoc &Dl, const X86Subtarget &Subtarget, | |||
| 3492 | SDValue *InGlue = nullptr) { | |||
| 3493 | assert((Subtarget.hasBWI()) && "Expected AVX512BW target!"); | |||
| 3494 | assert(Subtarget.is32Bit() && "Expecting 32 bit target"); | |||
| 3495 | assert(VA.getValVT() == MVT::v64i1 && | |||
| 3496 | "Expecting first location of 64 bit width type"); | |||
| 3497 | assert(NextVA.getValVT() == VA.getValVT() && | |||
| 3498 | "The locations should have the same type"); | |||
| 3499 | assert(VA.isRegLoc() && NextVA.isRegLoc() && | |||
| 3500 | "The values should reside in two registers"); | |||
| 3501 | ||||
| 3502 | SDValue Lo, Hi; | |||
| 3503 | SDValue ArgValueLo, ArgValueHi; | |||
| 3504 | ||||
| 3505 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 3506 | const TargetRegisterClass *RC = &X86::GR32RegClass; | |||
| 3507 | ||||
| 3508 | // Read a 32 bit value from the registers. | |||
| 3509 | if (nullptr == InGlue) { | |||
| 3510 | // When no physical register is present, | |||
| 3511 | // create an intermediate virtual register. | |||
| 3512 | Register Reg = MF.addLiveIn(VA.getLocReg(), RC); | |||
| 3513 | ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); | |||
| 3514 | Reg = MF.addLiveIn(NextVA.getLocReg(), RC); | |||
| 3515 | ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32); | |||
| 3516 | } else { | |||
| 3517 | // When a physical register is available, read the value from it and glue | |||
| 3518 | // the reads together. | |||
| 3519 | ArgValueLo = | |||
| 3520 | DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InGlue); | |||
| 3521 | *InGlue = ArgValueLo.getValue(2); | |||
| 3522 | ArgValueHi = | |||
| 3523 | DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InGlue); | |||
| 3524 | *InGlue = ArgValueHi.getValue(2); | |||
| 3525 | } | |||
| 3526 | ||||
| 3527 | // Convert the low i32 value to the v32i1 type. | |||
| 3528 | Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo); | |||
| 3529 | ||||
| 3530 | // Convert the high i32 value to the v32i1 type. | |||
| 3531 | Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi); | |||
| 3532 | ||||
| 3533 | // Concatenate the two values together. | |||
| 3534 | return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi); | |||
| 3535 | } | |||
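| | ||||
| | // Net effect, sketched for a v64i1 argument split across two GPRs (register | |||
| | // assignments are illustrative only): | |||
| | // | |||
| | //   v32i1 Lo  = bitcast(i32 read from the first register); | |||
| | //   v32i1 Hi  = bitcast(i32 read from the second register); | |||
| | //   v64i1 Arg = concat_vectors(Lo, Hi); | |||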
| 3536 | ||||
| 3537 | /// Lowers a register of various sizes (8/16/32/64 bit) | |||
| 3538 | /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1). | |||
| 3539 | /// \returns a DAG node containing the operand after lowering to the mask type. | |||
| 3540 | static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT, | |||
| 3541 | const EVT &ValLoc, const SDLoc &Dl, | |||
| 3542 | SelectionDAG &DAG) { | |||
| 3543 | SDValue ValReturned = ValArg; | |||
| 3544 | ||||
| 3545 | if (ValVT == MVT::v1i1) | |||
| 3546 | return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned); | |||
| 3547 | ||||
| 3548 | if (ValVT == MVT::v64i1) { | |||
| 3549 | // On a 32 bit machine this case is handled by getv64i1Argument. | |||
| 3550 | assert(ValLoc == MVT::i64 && "Expecting only i64 locations"); | |||
| 3551 | // On a 64 bit machine there is no need to truncate the value, only to bitcast it. | |||
| 3552 | } else { | |||
| 3553 | MVT maskLen; | |||
| 3554 | switch (ValVT.getSimpleVT().SimpleTy) { | |||
| 3555 | case MVT::v8i1: | |||
| 3556 | maskLen = MVT::i8; | |||
| 3557 | break; | |||
| 3558 | case MVT::v16i1: | |||
| 3559 | maskLen = MVT::i16; | |||
| 3560 | break; | |||
| 3561 | case MVT::v32i1: | |||
| 3562 | maskLen = MVT::i32; | |||
| 3563 | break; | |||
| 3564 | default: | |||
| 3565 | llvm_unreachable("Expecting a vector of i1 types")::llvm::llvm_unreachable_internal("Expecting a vector of i1 types" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 3565); | |||
| 3566 | } | |||
| 3567 | ||||
| 3568 | ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned); | |||
| 3569 | } | |||
| 3570 | return DAG.getBitcast(ValVT, ValReturned); | |||
| 3571 | } | |||
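| | ||||
| | // Sketch of the v16i1 case, assuming `DAG`/`Dl` are in scope and `Val` is an | |||
| | // i32 returned from a call: | |||
| | // | |||
| | //   SDValue AsI16 = DAG.getNode(ISD::TRUNCATE, Dl, MVT::i16, Val); | |||
| | //   SDValue Mask  = DAG.getBitcast(MVT::v16i1, AsI16); | |||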
| 3572 | ||||
| 3573 | /// Lower the result values of a call into the | |||
| 3574 | /// appropriate copies out of the corresponding physical registers. | |||
| 3575 | /// | |||
| 3576 | SDValue X86TargetLowering::LowerCallResult( | |||
| 3577 | SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, | |||
| 3578 | const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, | |||
| 3579 | SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, | |||
| 3580 | uint32_t *RegMask) const { | |||
| 3581 | ||||
| 3582 | const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); | |||
| 3583 | // Assign locations to each value returned by this call. | |||
| 3584 | SmallVector<CCValAssign, 16> RVLocs; | |||
| 3585 | CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs, | |||
| 3586 | *DAG.getContext()); | |||
| 3587 | CCInfo.AnalyzeCallResult(Ins, RetCC_X86); | |||
| 3588 | ||||
| 3589 | // Copy all of the result registers out of their specified physreg. | |||
| 3590 | for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E; | |||
| 3591 | ++I, ++InsIndex) { | |||
| 3592 | CCValAssign &VA = RVLocs[I]; | |||
| 3593 | EVT CopyVT = VA.getLocVT(); | |||
| 3594 | ||||
| 3595 | // In some calling conventions we need to remove the used registers | |||
| 3596 | // from the register mask. | |||
| 3597 | if (RegMask) { | |||
| 3598 | for (MCPhysReg SubReg : TRI->subregs_inclusive(VA.getLocReg())) | |||
| 3599 | RegMask[SubReg / 32] &= ~(1u << (SubReg % 32)); | |||
| 3600 | } | |||
| 3601 | ||||
| 3602 | // Report an error if there was an attempt to return FP values via XMM | |||
| 3603 | // registers. | |||
| 3604 | if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) { | |||
| 3605 | errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); | |||
| 3606 | if (VA.getLocReg() == X86::XMM1) | |||
| 3607 | VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts. | |||
| 3608 | else | |||
| 3609 | VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. | |||
| 3610 | } else if (!Subtarget.hasSSE2() && | |||
| 3611 | X86::FR64XRegClass.contains(VA.getLocReg()) && | |||
| 3612 | CopyVT == MVT::f64) { | |||
| 3613 | errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); | |||
| 3614 | if (VA.getLocReg() == X86::XMM1) | |||
| 3615 | VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts. | |||
| 3616 | else | |||
| 3617 | VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. | |||
| 3618 | } | |||
| 3619 | ||||
| 3620 | // If we prefer to use the value in xmm registers, copy it out as f80 and | |||
| 3621 | // use a truncate to move it from fp stack reg to xmm reg. | |||
| 3622 | bool RoundAfterCopy = false; | |||
| 3623 | if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) && | |||
| 3624 | isScalarFPTypeInSSEReg(VA.getValVT())) { | |||
| 3625 | if (!Subtarget.hasX87()) | |||
| 3626 | report_fatal_error("X87 register return with X87 disabled"); | |||
| 3627 | CopyVT = MVT::f80; | |||
| 3628 | RoundAfterCopy = (CopyVT != VA.getLocVT()); | |||
| 3629 | } | |||
| 3630 | ||||
| 3631 | SDValue Val; | |||
| 3632 | if (VA.needsCustom()) { | |||
| 3633 | assert(VA.getValVT() == MVT::v64i1 && | |||
| 3634 | "Currently the only custom case is when we split v64i1 to 2 regs"); | |||
| 3635 | Val = | |||
| 3636 | getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InGlue); | |||
| 3637 | } else { | |||
| 3638 | Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InGlue) | |||
| 3639 | .getValue(1); | |||
| 3640 | Val = Chain.getValue(0); | |||
| 3641 | InGlue = Chain.getValue(2); | |||
| 3642 | } | |||
| 3643 | ||||
| 3644 | if (RoundAfterCopy) | |||
| 3645 | Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, | |||
| 3646 | // This truncation won't change the value. | |||
| 3647 | DAG.getIntPtrConstant(1, dl, /*isTarget=*/true)); | |||
| 3648 | ||||
| 3649 | if (VA.isExtInLoc()) { | |||
| 3650 | if (VA.getValVT().isVector() && | |||
| 3651 | VA.getValVT().getScalarType() == MVT::i1 && | |||
| 3652 | ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || | |||
| 3653 | (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { | |||
| 3654 | // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 | |||
| 3655 | Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG); | |||
| 3656 | } else | |||
| 3657 | Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val); | |||
| 3658 | } | |||
| 3659 | ||||
| 3660 | if (VA.getLocInfo() == CCValAssign::BCvt) | |||
| 3661 | Val = DAG.getBitcast(VA.getValVT(), Val); | |||
| 3662 | ||||
| 3663 | InVals.push_back(Val); | |||
| 3664 | } | |||
| 3665 | ||||
| 3666 | return Chain; | |||
| 3667 | } | |||
| 3668 | ||||
| 3669 | //===----------------------------------------------------------------------===// | |||
| 3670 | // C & StdCall & Fast Calling Convention implementation | |||
| 3671 | //===----------------------------------------------------------------------===// | |||
| 3672 | // The StdCall calling convention is the standard for many Windows API | |||
| 3673 | // routines. It differs from the C calling convention only slightly: the | |||
| 3674 | // callee cleans up the stack, not the caller, and symbols are decorated | |||
| 3675 | // differently. It doesn't support any vector arguments. | |||
| 3676 | // For info on fast calling convention see Fast Calling Convention (tail call) | |||
| 3677 | // implementation LowerX86_32FastCCCallTo. | |||
| 3678 | ||||
| 3679 | /// Determines whether Args, either a set of outgoing arguments to a call, or a | |||
| 3680 | /// set of incoming args of a call, contains an sret pointer that the callee | |||
| 3681 | /// pops. | |||
| 3682 | template <typename T> | |||
| 3683 | static bool hasCalleePopSRet(const SmallVectorImpl<T> &Args, | |||
| 3684 | const X86Subtarget &Subtarget) { | |||
| 3685 | // Not C++20 (yet), so no concepts available. | |||
| 3686 | static_assert(std::is_same_v<T, ISD::OutputArg> || | |||
| 3687 | std::is_same_v<T, ISD::InputArg>, | |||
| 3688 | "requires ISD::OutputArg or ISD::InputArg"); | |||
| 3689 | ||||
| 3690 | // Only 32-bit targets pop the sret. It's a 64-bit world these days, so | |||
| 3691 | // early-out for most compilations. | |||
| 3692 | if (!Subtarget.is32Bit()) | |||
| 3693 | return false; | |||
| 3694 | ||||
| 3695 | if (Args.empty()) | |||
| 3696 | return false; | |||
| 3697 | ||||
| 3698 | // Most calls do not have an sret argument, check the arg next. | |||
| 3699 | const ISD::ArgFlagsTy &Flags = Args[0].Flags; | |||
| 3700 | if (!Flags.isSRet() || Flags.isInReg()) | |||
| 3701 | return false; | |||
| 3702 | ||||
| 3703 | // The MSVC ABI does not pop the sret. | |||
| 3704 | if (Subtarget.getTargetTriple().isOSMSVCRT()) | |||
| 3705 | return false; | |||
| 3706 | ||||
| 3707 | // MCU targets don't pop the sret. | |||
| 3708 | if (Subtarget.isTargetMCU()) | |||
| 3709 | return false; | |||
| 3710 | ||||
| 3711 | // Otherwise, the callee pops the sret argument. | |||
| 3712 | return true; | |||
| 3713 | } | |||
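| | ||||
| | // For example, on 32-bit Linux a function returning a large struct by value | |||
| | // takes a hidden sret pointer that the callee pops with `ret $4` | |||
| | // (hypothetical types): | |||
| | // | |||
| | //   struct Big { int A[8]; }; | |||
| | //   Big make();   // hasCalleePopSRet is true for this call's arguments | |||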
| 3714 | ||||
| 3715 | /// Make a copy of an aggregate at the address specified by "Src" to the address | |||
| 3716 | /// "Dst", with size and alignment information specified by the specific | |||
| 3717 | /// parameter attribute. The copy will be passed as a byval function parameter. | |||
| 3718 | static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, | |||
| 3719 | SDValue Chain, ISD::ArgFlagsTy Flags, | |||
| 3720 | SelectionDAG &DAG, const SDLoc &dl) { | |||
| 3721 | SDValue SizeNode = DAG.getIntPtrConstant(Flags.getByValSize(), dl); | |||
| 3722 | ||||
| 3723 | return DAG.getMemcpy( | |||
| 3724 | Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(), | |||
| 3725 | /*isVolatile*/ false, /*AlwaysInline=*/true, | |||
| 3726 | /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo()); | |||
| 3727 | } | |||
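| | ||||
| | // Typical use during call lowering, assuming SDValues `ByValArg` (source | |||
| | // address) and `StackSlot` (destination) plus the parameter's `Flags` are in | |||
| | // scope: | |||
| | // | |||
| | //   Chain = CreateCopyOfByValArgument(ByValArg, StackSlot, Chain, Flags, | |||
| | //                                     DAG, dl); | |||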
| 3728 | ||||
| 3729 | /// Return true if the calling convention is one that we can guarantee TCO for. | |||
| 3730 | static bool canGuaranteeTCO(CallingConv::ID CC) { | |||
| 3731 | return (CC == CallingConv::Fast || CC == CallingConv::GHC || | |||
| 3732 | CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE || | |||
| 3733 | CC == CallingConv::Tail || CC == CallingConv::SwiftTail); | |||
| 3734 | } | |||
| 3735 | ||||
| 3736 | /// Return true if we might ever do TCO for calls with this calling convention. | |||
| 3737 | static bool mayTailCallThisCC(CallingConv::ID CC) { | |||
| 3738 | switch (CC) { | |||
| 3739 | // C calling conventions: | |||
| 3740 | case CallingConv::C: | |||
| 3741 | case CallingConv::Win64: | |||
| 3742 | case CallingConv::X86_64_SysV: | |||
| 3743 | // Callee pop conventions: | |||
| 3744 | case CallingConv::X86_ThisCall: | |||
| 3745 | case CallingConv::X86_StdCall: | |||
| 3746 | case CallingConv::X86_VectorCall: | |||
| 3747 | case CallingConv::X86_FastCall: | |||
| 3748 | // Swift: | |||
| 3749 | case CallingConv::Swift: | |||
| 3750 | return true; | |||
| 3751 | default: | |||
| 3752 | return canGuaranteeTCO(CC); | |||
| 3753 | } | |||
| 3754 | } | |||
| 3755 | ||||
| 3756 | /// Return true if the function is being made into a tailcall target by | |||
| 3757 | /// changing its ABI. | |||
| 3758 | static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) { | |||
| 3759 | return (GuaranteedTailCallOpt && canGuaranteeTCO(CC)) || | |||
| 3760 | CC == CallingConv::Tail || CC == CallingConv::SwiftTail; | |||
| 3761 | } | |||
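| | ||||
| | // Putting the two predicates together, a minimal usage sketch (assuming the | |||
| | // call's CC and the target options `Opts` are in scope): | |||
| | // | |||
| | //   bool MustTCO = shouldGuaranteeTCO(CC, Opts.GuaranteedTailCallOpt); | |||
| | //   // true for tailcc/swifttailcc always, and for fastcc/GHC/RegCall/HiPE | |||
| | //   // only under -tailcallopt. | |||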
| 3762 | ||||
| 3763 | bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const { | |||
| 3764 | if (!CI->isTailCall()) | |||
| 3765 | return false; | |||
| 3766 | ||||
| 3767 | CallingConv::ID CalleeCC = CI->getCallingConv(); | |||
| 3768 | if (!mayTailCallThisCC(CalleeCC)) | |||
| 3769 | return false; | |||
| 3770 | ||||
| 3771 | return true; | |||
| 3772 | } | |||
| 3773 | ||||
| 3774 | SDValue | |||
| 3775 | X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv, | |||
| 3776 | const SmallVectorImpl<ISD::InputArg> &Ins, | |||
| 3777 | const SDLoc &dl, SelectionDAG &DAG, | |||
| 3778 | const CCValAssign &VA, | |||
| 3779 | MachineFrameInfo &MFI, unsigned i) const { | |||
| 3780 | // Create the nodes corresponding to a load from this parameter slot. | |||
| 3781 | ISD::ArgFlagsTy Flags = Ins[i].Flags; | |||
| 3782 | bool AlwaysUseMutable = shouldGuaranteeTCO( | |||
| 3783 | CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt); | |||
| 3784 | bool isImmutable = !AlwaysUseMutable && !Flags.isByVal(); | |||
| 3785 | EVT ValVT; | |||
| 3786 | MVT PtrVT = getPointerTy(DAG.getDataLayout()); | |||
| 3787 | ||||
| 3788 | // If the value is passed by pointer, we have the address passed instead of | |||
| 3789 | // the value itself. No need to extend if the mask value and location share | |||
| 3790 | // the same absolute size. | |||
| 3791 | bool ExtendedInMem = | |||
| 3792 | VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 && | |||
| 3793 | VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits(); | |||
| 3794 | ||||
| 3795 | if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem) | |||
| 3796 | ValVT = VA.getLocVT(); | |||
| 3797 | else | |||
| 3798 | ValVT = VA.getValVT(); | |||
| 3799 | ||||
| 3800 | // FIXME: For now, all byval parameter objects are marked mutable. This can be | |||
| 3801 | // changed with more analysis. | |||
| 3802 | // In the case of tail call optimization, mark all arguments mutable, since | |||
| 3803 | // they could be overwritten by the lowering of arguments for a tail call. | |||
| 3804 | if (Flags.isByVal()) { | |||
| 3805 | unsigned Bytes = Flags.getByValSize(); | |||
| 3806 | if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects. | |||
| 3807 | ||||
| 3808 | // FIXME: For now, all byval parameter objects are marked as aliasing. This | |||
| 3809 | // can be improved with deeper analysis. | |||
| 3810 | int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable, | |||
| 3811 | /*isAliased=*/true); | |||
| 3812 | return DAG.getFrameIndex(FI, PtrVT); | |||
| 3813 | } | |||
| 3814 | ||||
| 3815 | EVT ArgVT = Ins[i].ArgVT; | |||
| 3816 | ||||
| 3817 | // If this is a vector that has been split into multiple parts, and the | |||
| 3818 | // scalar size of the parts doesn't match the vector element size, then we can't | |||
| 3819 | // elide the copy. The parts will have padding between them instead of being | |||
| 3820 | // packed like a vector. | |||
| 3821 | bool ScalarizedAndExtendedVector = | |||
| 3822 | ArgVT.isVector() && !VA.getLocVT().isVector() && | |||
| 3823 | VA.getLocVT().getSizeInBits() != ArgVT.getScalarSizeInBits(); | |||
| 3824 | ||||
| 3825 | // This is an argument in memory. We might be able to perform copy elision. | |||
| 3826 | // If the argument is passed directly in memory without any extension, then we | |||
| 3827 | // can perform copy elision. Large vector types, for example, may be passed | |||
| 3828 | // indirectly by pointer. | |||
| 3829 | if (Flags.isCopyElisionCandidate() && | |||
| 3830 | VA.getLocInfo() != CCValAssign::Indirect && !ExtendedInMem && | |||
| 3831 | !ScalarizedAndExtendedVector) { | |||
| 3832 | SDValue PartAddr; | |||
| 3833 | if (Ins[i].PartOffset == 0) { | |||
| 3834 | // If this is a one-part value or the first part of a multi-part value, | |||
| 3835 | // create a stack object for the entire argument value type and return a | |||
| 3836 | // load from our portion of it. This assumes that if the first part of an | |||
| 3837 | // argument is in memory, the rest will also be in memory. | |||
| 3838 | int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(), | |||
| 3839 | /*IsImmutable=*/false); | |||
| 3840 | PartAddr = DAG.getFrameIndex(FI, PtrVT); | |||
| 3841 | return DAG.getLoad( | |||
| 3842 | ValVT, dl, Chain, PartAddr, | |||
| 3843 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); | |||
| 3844 | } else { | |||
| 3845 | // This is not the first piece of an argument in memory. See if there is | |||
| 3846 | // already a fixed stack object including this offset. If so, assume it | |||
| 3847 | // was created by the PartOffset == 0 branch above and create a load from | |||
| 3848 | // the appropriate offset into it. | |||
| 3849 | int64_t PartBegin = VA.getLocMemOffset(); | |||
| 3850 | int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8; | |||
| 3851 | int FI = MFI.getObjectIndexBegin(); | |||
| 3852 | for (; MFI.isFixedObjectIndex(FI); ++FI) { | |||
| 3853 | int64_t ObjBegin = MFI.getObjectOffset(FI); | |||
| 3854 | int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI); | |||
| 3855 | if (ObjBegin <= PartBegin && PartEnd <= ObjEnd) | |||
| 3856 | break; | |||
| 3857 | } | |||
| 3858 | if (MFI.isFixedObjectIndex(FI)) { | |||
| 3859 | SDValue Addr = | |||
| 3860 | DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT), | |||
| 3861 | DAG.getIntPtrConstant(Ins[i].PartOffset, dl)); | |||
| 3862 | return DAG.getLoad( | |||
| 3863 | ValVT, dl, Chain, Addr, | |||
| 3864 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI, | |||
| 3865 | Ins[i].PartOffset)); | |||
| 3866 | } | |||
| 3867 | } | |||
| 3868 | } | |||
| 3869 | ||||
| 3870 | int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8, | |||
| 3871 | VA.getLocMemOffset(), isImmutable); | |||
| 3872 | ||||
| 3873 | // Set SExt or ZExt flag. | |||
| 3874 | if (VA.getLocInfo() == CCValAssign::ZExt) { | |||
| 3875 | MFI.setObjectZExt(FI, true); | |||
| 3876 | } else if (VA.getLocInfo() == CCValAssign::SExt) { | |||
| 3877 | MFI.setObjectSExt(FI, true); | |||
| 3878 | } | |||
| 3879 | ||||
| 3880 | MaybeAlign Alignment; | |||
| 3881 | if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() && | |||
| 3882 | ValVT != MVT::f80) | |||
| 3883 | Alignment = MaybeAlign(4); | |||
| 3884 | SDValue FIN = DAG.getFrameIndex(FI, PtrVT); | |||
| 3885 | SDValue Val = DAG.getLoad( | |||
| 3886 | ValVT, dl, Chain, FIN, | |||
| 3887 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), | |||
| 3888 | Alignment); | |||
| 3889 | return ExtendedInMem | |||
| 3890 | ? (VA.getValVT().isVector() | |||
| 3891 | ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val) | |||
| 3892 | : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)) | |||
| 3893 | : Val; | |||
| 3894 | } | |||
| 3895 | ||||
| 3896 | // FIXME: Get this from tablegen. | |||
| 3897 | static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv, | |||
| 3898 | const X86Subtarget &Subtarget) { | |||
| 3899 | assert(Subtarget.is64Bit()); | |||
| 3900 | ||||
| 3901 | if (Subtarget.isCallingConvWin64(CallConv)) { | |||
| 3902 | static const MCPhysReg GPR64ArgRegsWin64[] = { | |||
| 3903 | X86::RCX, X86::RDX, X86::R8, X86::R9 | |||
| 3904 | }; | |||
| 3905 | return ArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64)); | |||
| 3906 | } | |||
| 3907 | ||||
| 3908 | static const MCPhysReg GPR64ArgRegs64Bit[] = { | |||
| 3909 | X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 | |||
| 3910 | }; | |||
| 3911 | return ArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit)); | |||
| 3912 | } | |||
| 3913 | ||||
| 3914 | // FIXME: Get this from tablegen. | |||
| 3915 | static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, | |||
| 3916 | CallingConv::ID CallConv, | |||
| 3917 | const X86Subtarget &Subtarget) { | |||
| 3918 | assert(Subtarget.is64Bit()); | |||
| 3919 | if (Subtarget.isCallingConvWin64(CallConv)) { | |||
| 3920 | // The XMM registers which might contain var arg parameters are shadowed | |||
| 3921 | // in their paired GPRs, so we only need to save the GPRs to their home | |||
| 3922 | // slots. | |||
| 3923 | // TODO: __vectorcall will change this. | |||
| 3924 | return std::nullopt; | |||
| 3925 | } | |||
| 3926 | ||||
| 3927 | bool isSoftFloat = Subtarget.useSoftFloat(); | |||
| 3928 | if (isSoftFloat || !Subtarget.hasSSE1()) | |||
| 3929 | // Kernel mode asks for SSE to be disabled, so there are no XMM argument | |||
| 3930 | // registers. | |||
| 3931 | return std::nullopt; | |||
| 3932 | ||||
| 3933 | static const MCPhysReg XMMArgRegs64Bit[] = { | |||
| 3934 | X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, | |||
| 3935 | X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 | |||
| 3936 | }; | |||
| 3937 | return ArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit)); | |||
| 3938 | } | |||
| 3939 | ||||
| 3940 | #ifndef NDEBUG | |||
| 3941 | static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) { | |||
| 3942 | return llvm::is_sorted( | |||
| 3943 | ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool { | |||
| 3944 | return A.getValNo() < B.getValNo(); | |||
| 3945 | }); | |||
| 3946 | } | |||
| 3947 | #endif | |||
| 3948 | ||||
| 3949 | namespace { | |||
| 3950 | /// This is a helper class for lowering variable argument parameters. | |||
| 3951 | class VarArgsLoweringHelper { | |||
| 3952 | public: | |||
| 3953 | VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc, | |||
| 3954 | SelectionDAG &DAG, const X86Subtarget &Subtarget, | |||
| 3955 | CallingConv::ID CallConv, CCState &CCInfo) | |||
| 3956 | : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget), | |||
| 3957 | TheMachineFunction(DAG.getMachineFunction()), | |||
| 3958 | TheFunction(TheMachineFunction.getFunction()), | |||
| 3959 | FrameInfo(TheMachineFunction.getFrameInfo()), | |||
| 3960 | FrameLowering(*Subtarget.getFrameLowering()), | |||
| 3961 | TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv), | |||
| 3962 | CCInfo(CCInfo) {} | |||
| 3963 | ||||
| 3964 | // Lower variable argument parameters. | |||
| 3965 | void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize); | |||
| 3966 | ||||
| 3967 | private: | |||
| 3968 | void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize); | |||
| 3969 | ||||
| 3970 | void forwardMustTailParameters(SDValue &Chain); | |||
| 3971 | ||||
| 3972 | bool is64Bit() const { return Subtarget.is64Bit(); } | |||
| 3973 | bool isWin64() const { return Subtarget.isCallingConvWin64(CallConv); } | |||
| 3974 | ||||
| 3975 | X86MachineFunctionInfo *FuncInfo; | |||
| 3976 | const SDLoc &DL; | |||
| 3977 | SelectionDAG &DAG; | |||
| 3978 | const X86Subtarget &Subtarget; | |||
| 3979 | MachineFunction &TheMachineFunction; | |||
| 3980 | const Function &TheFunction; | |||
| 3981 | MachineFrameInfo &FrameInfo; | |||
| 3982 | const TargetFrameLowering &FrameLowering; | |||
| 3983 | const TargetLowering &TargLowering; | |||
| 3984 | CallingConv::ID CallConv; | |||
| 3985 | CCState &CCInfo; | |||
| 3986 | }; | |||
| 3987 | } // namespace | |||
| 3988 | ||||
| 3989 | void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters( | |||
| 3990 | SDValue &Chain, unsigned StackSize) { | |||
| 3991 | // If the function takes a variable number of arguments, make a frame index for | |||
| 3992 | // the start of the first vararg value... for expansion of llvm.va_start. We | |||
| 3993 | // can skip this if there are no va_start calls. | |||
| 3994 | if (is64Bit() || (CallConv != CallingConv::X86_FastCall && | |||
| 3995 | CallConv != CallingConv::X86_ThisCall)) { | |||
| 3996 | FuncInfo->setVarArgsFrameIndex( | |||
| 3997 | FrameInfo.CreateFixedObject(1, StackSize, true)); | |||
| 3998 | } | |||
| 3999 | ||||
| 4000 | // 64-bit calling conventions support varargs and register parameters, so we | |||
| 4001 | // have to do extra work to spill them in the prologue. | |||
| 4002 | if (is64Bit()) { | |||
| 4003 | // Find the first unallocated argument register of each kind (GPR and XMM). | |||
| 4004 | ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget); | |||
| 4005 | ArrayRef<MCPhysReg> ArgXMMs = | |||
| 4006 | get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget); | |||
| 4007 | unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs); | |||
| 4008 | unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs); | |||
| 4009 | ||||
| 4010 | assert(!(NumXMMRegs && !Subtarget.hasSSE1()) && | |||
| 4011 | "SSE register cannot be used when SSE is disabled!"); | |||
| 4012 | ||||
| 4013 | if (isWin64()) { | |||
| 4014 | // Get to the caller-allocated home save location. Add 8 to account | |||
| 4015 | // for the return address. | |||
| 4016 | int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8; | |||
| 4017 | FuncInfo->setRegSaveFrameIndex( | |||
| 4018 | FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false)); | |||
| 4019 | // Fixup to set vararg frame on shadow area (4 x i64). | |||
| 4020 | if (NumIntRegs < 4) | |||
| 4021 | FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex()); | |||
| 4022 | } else { | |||
| 4023 | // For X86-64, if there are vararg parameters that are passed via | |||
| 4024 | // registers, then we must store them to their spots on the stack so | |||
| 4025 | // they may be loaded by dereferencing the result of va_next. | |||
| 4026 | FuncInfo->setVarArgsGPOffset(NumIntRegs * 8); | |||
| 4027 | FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16); | |||
| 4028 | FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject( | |||
| 4029 | ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false)); | |||
| 4030 | } | |||
| 4031 | ||||
| 4032 | // SDValues for the GPR registers holding live input values. | |||
| 4033 | SmallVector<SDValue, 6> LiveGPRs; | |||
| 4034 | // SDValues for the XMM registers holding live input values. | |||
| 4035 | SmallVector<SDValue, 8> LiveXMMRegs; | |||
| 4036 | SDValue ALVal; // if applicable, holds the SDValue for the %al register | |||
| 4037 | ||||
| 4038 | // Gather all the live in physical registers. | |||
| 4039 | for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) { | |||
| 4040 | Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass); | |||
| 4041 | LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64)); | |||
| 4042 | } | |||
| 4043 | const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs); | |||
| 4044 | if (!AvailableXmms.empty()) { | |||
| 4045 | Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass); | |||
| 4046 | ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8); | |||
| 4047 | for (MCPhysReg Reg : AvailableXmms) { | |||
| 4048 | // The fast register allocator spills virtual registers at basic | |||
| 4049 | // block boundaries, which leads to uses of XMM registers | |||
| 4050 | // outside of the check for %al. Pass physical registers to | |||
| 4051 | // VASTART_SAVE_XMM_REGS to avoid unnecessary spilling. | |||
| 4052 | TheMachineFunction.getRegInfo().addLiveIn(Reg); | |||
| 4053 | LiveXMMRegs.push_back(DAG.getRegister(Reg, MVT::v4f32)); | |||
| 4054 | } | |||
| 4055 | } | |||
| 4056 | ||||
| 4057 | // Store the integer parameter registers. | |||
| 4058 | SmallVector<SDValue, 8> MemOps; | |||
| 4059 | SDValue RSFIN = | |||
| 4060 | DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), | |||
| 4061 | TargLowering.getPointerTy(DAG.getDataLayout())); | |||
| 4062 | unsigned Offset = FuncInfo->getVarArgsGPOffset(); | |||
| 4063 | for (SDValue Val : LiveGPRs) { | |||
| 4064 | SDValue FIN = DAG.getNode(ISD::ADD, DL, | |||
| 4065 | TargLowering.getPointerTy(DAG.getDataLayout()), | |||
| 4066 | RSFIN, DAG.getIntPtrConstant(Offset, DL)); | |||
| 4067 | SDValue Store = | |||
| 4068 | DAG.getStore(Val.getValue(1), DL, Val, FIN, | |||
| 4069 | MachinePointerInfo::getFixedStack( | |||
| 4070 | DAG.getMachineFunction(), | |||
| 4071 | FuncInfo->getRegSaveFrameIndex(), Offset)); | |||
| 4072 | MemOps.push_back(Store); | |||
| 4073 | Offset += 8; | |||
| 4074 | } | |||
| 4075 | ||||
| 4076 | // Now store the XMM (fp + vector) parameter registers. | |||
| 4077 | if (!LiveXMMRegs.empty()) { | |||
| 4078 | SmallVector<SDValue, 12> SaveXMMOps; | |||
| 4079 | SaveXMMOps.push_back(Chain); | |||
| 4080 | SaveXMMOps.push_back(ALVal); | |||
| 4081 | SaveXMMOps.push_back(RSFIN); | |||
| 4082 | SaveXMMOps.push_back( | |||
| 4083 | DAG.getTargetConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32)); | |||
| 4084 | llvm::append_range(SaveXMMOps, LiveXMMRegs); | |||
| 4085 | MachineMemOperand *StoreMMO = | |||
| 4086 | DAG.getMachineFunction().getMachineMemOperand( | |||
| 4087 | MachinePointerInfo::getFixedStack( | |||
| 4088 | DAG.getMachineFunction(), FuncInfo->getRegSaveFrameIndex(), | |||
| 4089 | Offset), | |||
| 4090 | MachineMemOperand::MOStore, 128, Align(16)); | |||
| 4091 | MemOps.push_back(DAG.getMemIntrinsicNode(X86ISD::VASTART_SAVE_XMM_REGS, | |||
| 4092 | DL, DAG.getVTList(MVT::Other), | |||
| 4093 | SaveXMMOps, MVT::i8, StoreMMO)); | |||
| 4094 | } | |||
| 4095 | ||||
| 4096 | if (!MemOps.empty()) | |||
| 4097 | Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); | |||
| 4098 | } | |||
| 4099 | } | |||
| 4100 | ||||
| 4101 | void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) { | |||
| 4102 | // Find the largest legal vector type. | |||
| 4103 | MVT VecVT = MVT::Other; | |||
| 4104 | // FIXME: Only some x86_32 calling conventions support AVX512. | |||
| 4105 | if (Subtarget.useAVX512Regs() && | |||
| 4106 | (is64Bit() || (CallConv == CallingConv::X86_VectorCall || | |||
| 4107 | CallConv == CallingConv::Intel_OCL_BI))) | |||
| 4108 | VecVT = MVT::v16f32; | |||
| 4109 | else if (Subtarget.hasAVX()) | |||
| 4110 | VecVT = MVT::v8f32; | |||
| 4111 | else if (Subtarget.hasSSE2()) | |||
| 4112 | VecVT = MVT::v4f32; | |||
| 4113 | ||||
| 4114 | // We forward some GPRs and some vector types. | |||
| 4115 | SmallVector<MVT, 2> RegParmTypes; | |||
| 4116 | MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32; | |||
| 4117 | RegParmTypes.push_back(IntVT); | |||
| 4118 | if (VecVT != MVT::Other) | |||
| 4119 | RegParmTypes.push_back(VecVT); | |||
| 4120 | ||||
| 4121 | // Compute the set of forwarded registers. The rest are scratch. | |||
| 4122 | SmallVectorImpl<ForwardedRegister> &Forwards = | |||
| 4123 | FuncInfo->getForwardedMustTailRegParms(); | |||
| 4124 | CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86); | |||
| 4125 | ||||
| 4126 | // Forward AL for SysV x86_64 targets, since it is used for varargs. | |||
| 4127 | if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) { | |||
| 4128 | Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass); | |||
| 4129 | Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8)); | |||
| 4130 | } | |||
| 4131 | ||||
| 4132 | // Copy all forwards from physical to virtual registers. | |||
| 4133 | for (ForwardedRegister &FR : Forwards) { | |||
| 4134 | // FIXME: Can we use a less constrained schedule? | |||
| 4135 | SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT); | |||
| 4136 | FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister( | |||
| 4137 | TargLowering.getRegClassFor(FR.VT)); | |||
| 4138 | Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal); | |||
| 4139 | } | |||
| 4140 | } | |||
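| | // A minimal IR sketch of the case handled here (function names are | |||
| | // hypothetical): a vararg thunk forwarding everything to a musttail | |||
| | // callee: | |||
| | //   define void @thunk(i32 %x, ...) { | |||
| | //     musttail call void (i32, ...) @impl(i32 %x, ...) | |||
| | //     ret void | |||
| | //   } | |||
| | // Every register that could carry a vararg must be forwarded, since the | |||
| | // thunk cannot know which ones its caller actually used. | |||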
| 4141 | ||||
| 4142 | void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain, | |||
| 4143 | unsigned StackSize) { | |||
| 4144 | // Set FrameIndex to the 0xAAAAAAA value to mark the unset state. | |||
| 4145 | // If necessary, it will be set to the correct value later. | |||
| 4146 | FuncInfo->setVarArgsFrameIndex(0xAAAAAAA); | |||
| 4147 | FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); | |||
| 4148 | ||||
| 4149 | if (FrameInfo.hasVAStart()) | |||
| 4150 | createVarArgAreaAndStoreRegisters(Chain, StackSize); | |||
| 4151 | ||||
| 4152 | if (FrameInfo.hasMustTailInVarArgFunc()) | |||
| 4153 | forwardMustTailParameters(Chain); | |||
| 4154 | } | |||
| 4155 | ||||
| 4156 | SDValue X86TargetLowering::LowerFormalArguments( | |||
| 4157 | SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, | |||
| 4158 | const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, | |||
| 4159 | SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { | |||
| 4160 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 4161 | X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); | |||
| 4162 | ||||
| 4163 | const Function &F = MF.getFunction(); | |||
| 4164 | if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() && | |||
| 4165 | F.getName() == "main") | |||
| 4166 | FuncInfo->setForceFramePointer(true); | |||
| 4167 | ||||
| 4168 | MachineFrameInfo &MFI = MF.getFrameInfo(); | |||
| 4169 | bool Is64Bit = Subtarget.is64Bit(); | |||
| 4170 | bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); | |||
| 4171 | ||||
| 4172 | assert( | |||
| 4173 | !(IsVarArg && canGuaranteeTCO(CallConv)) && | |||
| 4174 | "Var args not supported with calling conv' regcall, fastcc, ghc or hipe"); | |||
| 4175 | ||||
| 4176 | // Assign locations to all of the incoming arguments. | |||
| 4177 | SmallVector<CCValAssign, 16> ArgLocs; | |||
| 4178 | CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext()); | |||
| 4179 | ||||
| 4180 | // Allocate shadow area for Win64. | |||
| 4181 | if (IsWin64) | |||
| 4182 | CCInfo.AllocateStack(32, Align(8)); | |||
| 4183 | ||||
| 4184 | CCInfo.AnalyzeArguments(Ins, CC_X86); | |||
| 4185 | ||||
| 4186 | // In vectorcall calling convention a second pass is required for the HVA | |||
| 4187 | // types. | |||
| 4188 | if (CallingConv::X86_VectorCall == CallConv) { | |||
| 4189 | CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86); | |||
| 4190 | } | |||
| 4191 | ||||
| 4192 | // The next loop assumes that the locations are in the same order as the | |||
| 4193 | // input arguments. | |||
| 4194 | assert(isSortedByValueNo(ArgLocs) && | |||
| 4195 | "Argument Location list must be sorted before lowering"); | |||
| 4196 | ||||
| 4197 | SDValue ArgValue; | |||
| 4198 | for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E; | |||
| 4199 | ++I, ++InsIndex) { | |||
| 4200 | assert(InsIndex < Ins.size() && "Invalid Ins index"); | |||
| 4201 | CCValAssign &VA = ArgLocs[I]; | |||
| 4202 | ||||
| 4203 | if (VA.isRegLoc()) { | |||
| 4204 | EVT RegVT = VA.getLocVT(); | |||
| 4205 | if (VA.needsCustom()) { | |||
| 4206 | assert( | |||
| 4207 | VA.getValVT() == MVT::v64i1 && | |||
| 4208 | "Currently the only custom case is when we split v64i1 to 2 regs"); | |||
| 4209 | ||||
| 4210 | // In the regcall calling convention, v64i1 values that are | |||
| 4211 | // compiled for a 32-bit target are split into two registers. | |||
| 4212 | ArgValue = | |||
| 4213 | getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget); | |||
| 4214 | } else { | |||
| 4215 | const TargetRegisterClass *RC; | |||
| 4216 | if (RegVT == MVT::i8) | |||
| 4217 | RC = &X86::GR8RegClass; | |||
| 4218 | else if (RegVT == MVT::i16) | |||
| 4219 | RC = &X86::GR16RegClass; | |||
| 4220 | else if (RegVT == MVT::i32) | |||
| 4221 | RC = &X86::GR32RegClass; | |||
| 4222 | else if (Is64Bit && RegVT == MVT::i64) | |||
| 4223 | RC = &X86::GR64RegClass; | |||
| 4224 | else if (RegVT == MVT::f16) | |||
| 4225 | RC = Subtarget.hasAVX512() ? &X86::FR16XRegClass : &X86::FR16RegClass; | |||
| 4226 | else if (RegVT == MVT::f32) | |||
| 4227 | RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass; | |||
| 4228 | else if (RegVT == MVT::f64) | |||
| 4229 | RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; | |||
| 4230 | else if (RegVT == MVT::f80) | |||
| 4231 | RC = &X86::RFP80RegClass; | |||
| 4232 | else if (RegVT == MVT::f128) | |||
| 4233 | RC = &X86::VR128RegClass; | |||
| 4234 | else if (RegVT.is512BitVector()) | |||
| 4235 | RC = &X86::VR512RegClass; | |||
| 4236 | else if (RegVT.is256BitVector()) | |||
| 4237 | RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass; | |||
| 4238 | else if (RegVT.is128BitVector()) | |||
| 4239 | RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass; | |||
| 4240 | else if (RegVT == MVT::x86mmx) | |||
| 4241 | RC = &X86::VR64RegClass; | |||
| 4242 | else if (RegVT == MVT::v1i1) | |||
| 4243 | RC = &X86::VK1RegClass; | |||
| 4244 | else if (RegVT == MVT::v8i1) | |||
| 4245 | RC = &X86::VK8RegClass; | |||
| 4246 | else if (RegVT == MVT::v16i1) | |||
| 4247 | RC = &X86::VK16RegClass; | |||
| 4248 | else if (RegVT == MVT::v32i1) | |||
| 4249 | RC = &X86::VK32RegClass; | |||
| 4250 | else if (RegVT == MVT::v64i1) | |||
| 4251 | RC = &X86::VK64RegClass; | |||
| 4252 | else | |||
| 4253 | llvm_unreachable("Unknown argument type!"); | |||
| 4254 | ||||
| 4255 | Register Reg = MF.addLiveIn(VA.getLocReg(), RC); | |||
| 4256 | ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT); | |||
| 4257 | } | |||
| 4258 | ||||
| 4259 | // If this is an 8- or 16-bit value, it is really passed promoted to 32 | |||
| 4260 | // bits. Insert an assert[sz]ext to capture this, then truncate to the | |||
| 4261 | // right size. | |||
| 4262 | if (VA.getLocInfo() == CCValAssign::SExt) | |||
| 4263 | ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue, | |||
| 4264 | DAG.getValueType(VA.getValVT())); | |||
| 4265 | else if (VA.getLocInfo() == CCValAssign::ZExt) | |||
| 4266 | ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue, | |||
| 4267 | DAG.getValueType(VA.getValVT())); | |||
| 4268 | else if (VA.getLocInfo() == CCValAssign::BCvt) | |||
| 4269 | ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue); | |||
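| | // For example (illustrative), an i16 argument promoted to i32 by the | |||
| | // calling convention arrives as: | |||
| | //   t1 = CopyFromReg %edi (i32) | |||
| | //   t2 = AssertSext t1, i16 | |||
| | // and the TRUNCATE in the isExtInLoc() case below recovers the i16. | |||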
| 4270 | ||||
| 4271 | if (VA.isExtInLoc()) { | |||
| 4272 | // Handle MMX values passed in XMM regs. | |||
| 4273 | if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1) | |||
| 4274 | ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue); | |||
| 4275 | else if (VA.getValVT().isVector() && | |||
| 4276 | VA.getValVT().getScalarType() == MVT::i1 && | |||
| 4277 | ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) || | |||
| 4278 | (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) { | |||
| 4279 | // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8 | |||
| 4280 | ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG); | |||
| 4281 | } else | |||
| 4282 | ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue); | |||
| 4283 | } | |||
| 4284 | } else { | |||
| 4285 | assert(VA.isMemLoc()); | |||
| 4286 | ArgValue = | |||
| 4287 | LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex); | |||
| 4288 | } | |||
| 4289 | ||||
| 4290 | // If the value is passed via a pointer, do a load. | |||
| 4291 | if (VA.getLocInfo() == CCValAssign::Indirect && !Ins[I].Flags.isByVal()) | |||
| 4292 | ArgValue = | |||
| 4293 | DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo()); | |||
| 4294 | ||||
| 4295 | InVals.push_back(ArgValue); | |||
| 4296 | } | |||
| 4297 | ||||
| 4298 | for (unsigned I = 0, E = Ins.size(); I != E; ++I) { | |||
| 4299 | if (Ins[I].Flags.isSwiftAsync()) { | |||
| 4300 | auto X86FI = MF.getInfo<X86MachineFunctionInfo>(); | |||
| 4301 | if (Subtarget.is64Bit()) | |||
| 4302 | X86FI->setHasSwiftAsyncContext(true); | |||
| 4303 | else { | |||
| 4304 | int FI = MF.getFrameInfo().CreateStackObject(4, Align(4), false); | |||
| 4305 | X86FI->setSwiftAsyncContextFrameIdx(FI); | |||
| 4306 | SDValue St = DAG.getStore(DAG.getEntryNode(), dl, InVals[I], | |||
| 4307 | DAG.getFrameIndex(FI, MVT::i32), | |||
| 4308 | MachinePointerInfo::getFixedStack(MF, FI)); | |||
| 4309 | Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, St, Chain); | |||
| 4310 | } | |||
| 4311 | } | |||
| 4312 | ||||
| 4313 | // The Swift calling convention does not require that we copy the sret argument | |||
| 4314 | // into %rax/%eax for the return. We don't set SRetReturnReg for Swift. | |||
| 4315 | if (CallConv == CallingConv::Swift || CallConv == CallingConv::SwiftTail) | |||
| 4316 | continue; | |||
| 4317 | ||||
| 4318 | // All x86 ABIs require that for returning structs by value we copy the | |||
| 4319 | // sret argument into %rax/%eax (depending on ABI) for the return. Save | |||
| 4320 | // the argument into a virtual register so that we can access it from the | |||
| 4321 | // return points. | |||
| 4322 | if (Ins[I].Flags.isSRet()) { | |||
| 4323 | assert(!FuncInfo->getSRetReturnReg() && | |||
| 4324 | "SRet return has already been set"); | |||
| 4325 | MVT PtrTy = getPointerTy(DAG.getDataLayout()); | |||
| 4326 | Register Reg = | |||
| 4327 | MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy)); | |||
| 4328 | FuncInfo->setSRetReturnReg(Reg); | |||
| 4329 | SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]); | |||
| 4330 | Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain); | |||
| 4331 | break; | |||
| 4332 | } | |||
| 4333 | } | |||
| 4334 | ||||
| 4335 | unsigned StackSize = CCInfo.getNextStackOffset(); | |||
| 4336 | // Align stack specially for tail calls. | |||
| 4337 | if (shouldGuaranteeTCO(CallConv, | |||
| 4338 | MF.getTarget().Options.GuaranteedTailCallOpt)) | |||
| 4339 | StackSize = GetAlignedArgumentStackSize(StackSize, DAG); | |||
| 4340 | ||||
| 4341 | if (IsVarArg) | |||
| 4342 | VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo) | |||
| 4343 | .lowerVarArgsParameters(Chain, StackSize); | |||
| 4344 | ||||
| 4345 | // Some CCs need callee pop. | |||
| 4346 | if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg, | |||
| 4347 | MF.getTarget().Options.GuaranteedTailCallOpt)) { | |||
| 4348 | FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything. | |||
| 4349 | } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) { | |||
| 4350 | // X86 interrupts must pop the error code (and the alignment padding) if | |||
| 4351 | // present. | |||
| 4352 | FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4); | |||
| 4353 | } else { | |||
| 4354 | FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing. | |||
| 4355 | // If this is an sret function, the return should pop the hidden pointer. | |||
| 4356 | if (!canGuaranteeTCO(CallConv) && hasCalleePopSRet(Ins, Subtarget)) | |||
| 4357 | FuncInfo->setBytesToPopOnReturn(4); | |||
| 4358 | } | |||
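| | // Concrete examples of the cases above, for illustration: an i386 | |||
| | // stdcall function taking two ints returns with "ret $8"; an i386 sret | |||
| | // function pops just the hidden pointer with "ret $4"; a 64-bit | |||
| | // interrupt handler with an error code pops 16 bytes. | |||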
| 4359 | ||||
| 4360 | if (!Is64Bit) { | |||
| 4361 | // RegSaveFrameIndex is X86-64 only. | |||
| 4362 | FuncInfo->setRegSaveFrameIndex(0xAAAAAAA); | |||
| 4363 | } | |||
| 4364 | ||||
| 4365 | FuncInfo->setArgumentStackSize(StackSize); | |||
| 4366 | ||||
| 4367 | if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { | |||
| 4368 | EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn()); | |||
| 4369 | if (Personality == EHPersonality::CoreCLR) { | |||
| 4370 | assert(Is64Bit); | |||
| 4371 | // TODO: Add a mechanism to frame lowering that will allow us to indicate | |||
| 4372 | // that we'd prefer this slot be allocated towards the bottom of the frame | |||
| 4373 | // (i.e. near the stack pointer after allocating the frame). Every | |||
| 4374 | // funclet needs a copy of this slot in its (mostly empty) frame, and the | |||
| 4375 | // offset from the bottom of this and each funclet's frame must be the | |||
| 4376 | // same, so the size of funclets' (mostly empty) frames is dictated by | |||
| 4377 | // how far this slot is from the bottom (since they allocate just enough | |||
| 4378 | // space to accommodate holding this slot at the correct offset). | |||
| 4379 | int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSpillSlot=*/false); | |||
| 4380 | EHInfo->PSPSymFrameIdx = PSPSymFI; | |||
| 4381 | } | |||
| 4382 | } | |||
| 4383 | ||||
| 4384 | if (shouldDisableArgRegFromCSR(CallConv) || | |||
| 4385 | F.hasFnAttribute("no_caller_saved_registers")) { | |||
| 4386 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
| 4387 | for (std::pair<Register, Register> Pair : MRI.liveins()) | |||
| 4388 | MRI.disableCalleeSavedRegister(Pair.first); | |||
| 4389 | } | |||
| 4390 | ||||
| 4391 | return Chain; | |||
| 4392 | } | |||
| 4393 | ||||
| 4394 | SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, | |||
| 4395 | SDValue Arg, const SDLoc &dl, | |||
| 4396 | SelectionDAG &DAG, | |||
| 4397 | const CCValAssign &VA, | |||
| 4398 | ISD::ArgFlagsTy Flags, | |||
| 4399 | bool isByVal) const { | |||
| 4400 | unsigned LocMemOffset = VA.getLocMemOffset(); | |||
| 4401 | SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl); | |||
| 4402 | PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), | |||
| 4403 | StackPtr, PtrOff); | |||
| 4404 | if (isByVal) | |||
| 4405 | return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl); | |||
| 4406 | ||||
| 4407 | MaybeAlign Alignment; | |||
| 4408 | if (Subtarget.isTargetWindowsMSVC() && !Subtarget.is64Bit() && | |||
| 4409 | Arg.getSimpleValueType() != MVT::f80) | |||
| 4410 | Alignment = MaybeAlign(4); | |||
| 4411 | return DAG.getStore( | |||
| 4412 | Chain, dl, Arg, PtrOff, | |||
| 4413 | MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset), | |||
| 4414 | Alignment); | |||
| 4415 | } | |||
| 4416 | ||||
| 4417 | /// Emit a load of the return address if tail call | |||
| 4418 | /// optimization is performed and it is required. | |||
| 4419 | SDValue X86TargetLowering::EmitTailCallLoadRetAddr( | |||
| 4420 | SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall, | |||
| 4421 | bool Is64Bit, int FPDiff, const SDLoc &dl) const { | |||
| 4422 | // Adjust the Return address stack slot. | |||
| 4423 | EVT VT = getPointerTy(DAG.getDataLayout()); | |||
| 4424 | OutRetAddr = getReturnAddressFrameIndex(DAG); | |||
| 4425 | ||||
| 4426 | // Load the "old" Return address. | |||
| 4427 | OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo()); | |||
| 4428 | return SDValue(OutRetAddr.getNode(), 1); | |||
| 4429 | } | |||
| 4430 | ||||
| 4431 | /// Emit a store of the return address if tail call | |||
| 4432 | /// optimization is performed and it is required (FPDiff!=0). | |||
| 4433 | static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF, | |||
| 4434 | SDValue Chain, SDValue RetAddrFrIdx, | |||
| 4435 | EVT PtrVT, unsigned SlotSize, | |||
| 4436 | int FPDiff, const SDLoc &dl) { | |||
| 4437 | // Store the return address to the appropriate stack slot. | |||
| 4438 | if (!FPDiff) return Chain; | |||
| 4439 | // Calculate the new stack slot for the return address. | |||
| 4440 | int NewReturnAddrFI = | |||
| 4441 | MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize, | |||
| 4442 | false); | |||
| 4443 | SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); | |||
| 4444 | Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, | |||
| 4445 | MachinePointerInfo::getFixedStack( | |||
| 4446 | DAG.getMachineFunction(), NewReturnAddrFI)); | |||
| 4447 | return Chain; | |||
| 4448 | } | |||
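| | // Worked example (sketch): if the caller reserved 8 bytes of argument | |||
| | // space but the tail callee needs 24, FPDiff = 8 - 24 = -16 and, with an | |||
| | // 8-byte slot, the fixed object above is created at offset | |||
| | // FPDiff - SlotSize = -24, so the slot moves with the enlarged argument | |||
| | // area. | |||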
| 4449 | ||||
| 4450 | /// Returns a vector_shuffle mask for a movs{s|d} or movd | |||
| 4451 | /// operation of the specified width. | |||
| 4452 | static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, | |||
| 4453 | SDValue V2) { | |||
| 4454 | unsigned NumElems = VT.getVectorNumElements(); | |||
| 4455 | SmallVector<int, 8> Mask; | |||
| 4456 | Mask.push_back(NumElems); | |||
| 4457 | for (unsigned i = 1; i != NumElems; ++i) | |||
| 4458 | Mask.push_back(i); | |||
| 4459 | return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); | |||
| 4460 | } | |||
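| | // e.g. for VT = v4f32 this builds the mask <4, 1, 2, 3>: element 0 is | |||
| | // taken from V2 and the rest from V1, which is exactly the MOVSS | |||
| | // pattern; for VT = v2i64 the mask is <2, 1>, the MOVSD pattern. | |||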
| 4461 | ||||
| 4462 | SDValue | |||
| 4463 | X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, | |||
| 4464 | SmallVectorImpl<SDValue> &InVals) const { | |||
| 4465 | SelectionDAG &DAG = CLI.DAG; | |||
| 4466 | SDLoc &dl = CLI.DL; | |||
| 4467 | SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs; | |||
| 4468 | SmallVectorImpl<SDValue> &OutVals = CLI.OutVals; | |||
| 4469 | SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins; | |||
| 4470 | SDValue Chain = CLI.Chain; | |||
| 4471 | SDValue Callee = CLI.Callee; | |||
| 4472 | CallingConv::ID CallConv = CLI.CallConv; | |||
| 4473 | bool &isTailCall = CLI.IsTailCall; | |||
| 4474 | bool isVarArg = CLI.IsVarArg; | |||
| 4475 | const auto *CB = CLI.CB; | |||
| 4476 | ||||
| 4477 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 4478 | bool Is64Bit = Subtarget.is64Bit(); | |||
| 4479 | bool IsWin64 = Subtarget.isCallingConvWin64(CallConv); | |||
| 4480 | bool IsSibcall = false; | |||
| 4481 | bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt || | |||
| 4482 | CallConv == CallingConv::Tail || CallConv == CallingConv::SwiftTail; | |||
| 4483 | bool IsCalleePopSRet = !IsGuaranteeTCO && hasCalleePopSRet(Outs, Subtarget); | |||
| 4484 | X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); | |||
| 4485 | bool HasNCSR = (CB && isa<CallInst>(CB) && | |||
| 4486 | CB->hasFnAttr("no_caller_saved_registers")); | |||
| 4487 | bool HasNoCfCheck = (CB && CB->doesNoCfCheck()); | |||
| 4488 | bool IsIndirectCall = (CB && isa<CallInst>(CB) && CB->isIndirectCall()); | |||
| 4489 | bool IsCFICall = IsIndirectCall && CLI.CFIType; | |||
| 4490 | const Module *M = MF.getMMI().getModule(); | |||
| 4491 | Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch"); | |||
| 4492 | ||||
| 4493 | MachineFunction::CallSiteInfo CSInfo; | |||
| 4494 | if (CallConv == CallingConv::X86_INTR) | |||
| 4495 | report_fatal_error("X86 interrupts may not be called directly"); | |||
| 4496 | ||||
| 4497 | bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall(); | |||
| 4498 | if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO && !IsMustTail) { | |||
| 4499 | // If we are using a GOT, disable tail calls to external symbols with | |||
| 4500 | // default visibility. Tail calling such a symbol requires using a GOT | |||
| 4501 | // relocation, which forces early binding of the symbol. This breaks code | |||
| 4502 | // that requires lazy function symbol resolution. Using musttail or | |||
| 4503 | // GuaranteedTailCallOpt will override this. | |||
| 4504 | GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); | |||
| 4505 | if (!G || (!G->getGlobal()->hasLocalLinkage() && | |||
| 4506 | G->getGlobal()->hasDefaultVisibility())) | |||
| 4507 | isTailCall = false; | |||
| 4508 | } | |||
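| | // For instance (illustration), in -fPIC i386 code a "return printf(...)" | |||
| | // in tail position is demoted to an ordinary call here: a tail jump to | |||
| | // printf would need a GOT relocation and so force early binding. | |||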
| 4509 | ||||
| 4510 | if (isTailCall && !IsMustTail) { | |||
| 4511 | // Check if it's really possible to do a tail call. | |||
| 4512 | isTailCall = IsEligibleForTailCallOptimization( | |||
| 4513 | Callee, CallConv, IsCalleePopSRet, isVarArg, CLI.RetTy, Outs, OutVals, | |||
| 4514 | Ins, DAG); | |||
| 4515 | ||||
| 4516 | // Sibcalls are automatically detected tail calls that do not require | |||
| 4517 | // ABI changes. | |||
| 4518 | if (!IsGuaranteeTCO && isTailCall) | |||
| 4519 | IsSibcall = true; | |||
| 4520 | ||||
| 4521 | if (isTailCall) | |||
| 4522 | ++NumTailCalls; | |||
| 4523 | } | |||
| 4524 | ||||
| 4525 | if (IsMustTail && !isTailCall) | |||
| 4526 | report_fatal_error("failed to perform tail call elimination on a call " | |||
| 4527 | "site marked musttail"); | |||
| 4528 | ||||
| 4529 | assert(!(isVarArg && canGuaranteeTCO(CallConv)) && | |||
| 4530 | "Var args not supported with calling convention fastcc, ghc or hipe"); | |||
| 4531 | ||||
| 4532 | // Analyze operands of the call, assigning locations to each operand. | |||
| 4533 | SmallVector<CCValAssign, 16> ArgLocs; | |||
| 4534 | CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext()); | |||
| 4535 | ||||
| 4536 | // Allocate shadow area for Win64. | |||
| 4537 | if (IsWin64) | |||
| 4538 | CCInfo.AllocateStack(32, Align(8)); | |||
| 4539 | ||||
| 4540 | CCInfo.AnalyzeArguments(Outs, CC_X86); | |||
| 4541 | ||||
| 4542 | // In vectorcall calling convention a second pass is required for the HVA | |||
| 4543 | // types. | |||
| 4544 | if (CallingConv::X86_VectorCall == CallConv) { | |||
| 4545 | CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86); | |||
| 4546 | } | |||
| 4547 | ||||
| 4548 | // Get a count of how many bytes are to be pushed on the stack. | |||
| 4549 | unsigned NumBytes = CCInfo.getAlignedCallFrameSize(); | |||
| 4550 | if (IsSibcall) | |||
| 4551 | // This is a sibcall. The memory operands are available in the caller's | |||
| 4552 | // own caller's stack. | |||
| 4553 | NumBytes = 0; | |||
| 4554 | else if (IsGuaranteeTCO && canGuaranteeTCO(CallConv)) | |||
| 4555 | NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); | |||
| 4556 | ||||
| 4557 | int FPDiff = 0; | |||
| 4558 | if (isTailCall && | |||
| 4559 | shouldGuaranteeTCO(CallConv, | |||
| 4560 | MF.getTarget().Options.GuaranteedTailCallOpt)) { | |||
| 4561 | // Lower arguments at fp - stackoffset + fpdiff. | |||
| 4562 | unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn(); | |||
| 4563 | ||||
| 4564 | FPDiff = NumBytesCallerPushed - NumBytes; | |||
| 4565 | ||||
| 4566 | // Record the delta by which the return-address stack slot moves, but only | |||
| 4567 | // if it is a larger movement (a more negative delta) than previously recorded. | |||
| 4568 | if (FPDiff < X86Info->getTCReturnAddrDelta()) | |||
| 4569 | X86Info->setTCReturnAddrDelta(FPDiff); | |||
| 4570 | } | |||
| 4571 | ||||
| 4572 | unsigned NumBytesToPush = NumBytes; | |||
| 4573 | unsigned NumBytesToPop = NumBytes; | |||
| 4574 | ||||
| 4575 | // If we have an inalloca argument, all stack space has already been allocated | |||
| 4576 | // for us and is right at the top of the stack. We don't support multiple | |||
| 4577 | // arguments passed in memory when using inalloca. | |||
| 4578 | if (!Outs.empty() && Outs.back().Flags.isInAlloca()) { | |||
| 4579 | NumBytesToPush = 0; | |||
| 4580 | if (!ArgLocs.back().isMemLoc()) | |||
| 4581 | report_fatal_error("cannot use inalloca attribute on a register " | |||
| 4582 | "parameter"); | |||
| 4583 | if (ArgLocs.back().getLocMemOffset() != 0) | |||
| 4584 | report_fatal_error("any parameter with the inalloca attribute must be " | |||
| 4585 | "the only memory argument"); | |||
| 4586 | } else if (CLI.IsPreallocated) { | |||
| 4587 | assert(ArgLocs.back().isMemLoc() && | |||
| 4588 | "cannot use preallocated attribute on a register " | |||
| 4589 | "parameter"); | |||
| 4590 | SmallVector<size_t, 4> PreallocatedOffsets; | |||
| 4591 | for (size_t i = 0; i < CLI.OutVals.size(); ++i) { | |||
| 4592 | if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) { | |||
| 4593 | PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset()); | |||
| 4594 | } | |||
| 4595 | } | |||
| 4596 | auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>(); | |||
| 4597 | size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB); | |||
| 4598 | MFI->setPreallocatedStackSize(PreallocatedId, NumBytes); | |||
| 4599 | MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets); | |||
| 4600 | NumBytesToPush = 0; | |||
| 4601 | } | |||
| 4602 | ||||
| 4603 | if (!IsSibcall && !IsMustTail) | |||
| 4604 | Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush, | |||
| 4605 | NumBytes - NumBytesToPush, dl); | |||
| 4606 | ||||
| 4607 | SDValue RetAddrFrIdx; | |||
| 4608 | // Load return address for tail calls. | |||
| 4609 | if (isTailCall && FPDiff) | |||
| 4610 | Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall, | |||
| 4611 | Is64Bit, FPDiff, dl); | |||
| 4612 | ||||
| 4613 | SmallVector<std::pair<Register, SDValue>, 8> RegsToPass; | |||
| 4614 | SmallVector<SDValue, 8> MemOpChains; | |||
| 4615 | SDValue StackPtr; | |||
| 4616 | ||||
| 4617 | // The next loop assumes that the locations are in the same order as the | |||
| 4618 | // input arguments. | |||
| 4619 | assert(isSortedByValueNo(ArgLocs) && | |||
| 4620 | "Argument Location list must be sorted before lowering"); | |||
| 4621 | ||||
| 4622 | // Walk the register/memloc assignments, inserting copies/loads. In the case | |||
| 4623 | // of tail call optimization, arguments are handled later. | |||
| 4624 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); | |||
| 4625 | for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E; | |||
| 4626 | ++I, ++OutIndex) { | |||
| 4627 | assert(OutIndex < Outs.size() && "Invalid Out index"); | |||
| 4628 | // Skip inalloca/preallocated arguments, they have already been written. | |||
| 4629 | ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags; | |||
| 4630 | if (Flags.isInAlloca() || Flags.isPreallocated()) | |||
| 4631 | continue; | |||
| 4632 | ||||
| 4633 | CCValAssign &VA = ArgLocs[I]; | |||
| 4634 | EVT RegVT = VA.getLocVT(); | |||
| 4635 | SDValue Arg = OutVals[OutIndex]; | |||
| 4636 | bool isByVal = Flags.isByVal(); | |||
| 4637 | ||||
| 4638 | // Promote the value if needed. | |||
| 4639 | switch (VA.getLocInfo()) { | |||
| 4640 | default: llvm_unreachable("Unknown loc info!"); | |||
| 4641 | case CCValAssign::Full: break; | |||
| 4642 | case CCValAssign::SExt: | |||
| 4643 | Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg); | |||
| 4644 | break; | |||
| 4645 | case CCValAssign::ZExt: | |||
| 4646 | Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg); | |||
| 4647 | break; | |||
| 4648 | case CCValAssign::AExt: | |||
| 4649 | if (Arg.getValueType().isVector() && | |||
| 4650 | Arg.getValueType().getVectorElementType() == MVT::i1) | |||
| 4651 | Arg = lowerMasksToReg(Arg, RegVT, dl, DAG); | |||
| 4652 | else if (RegVT.is128BitVector()) { | |||
| 4653 | // Special case: passing MMX values in XMM registers. | |||
| 4654 | Arg = DAG.getBitcast(MVT::i64, Arg); | |||
| 4655 | Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg); | |||
| 4656 | Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg); | |||
| 4657 | } else | |||
| 4658 | Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg); | |||
| 4659 | break; | |||
| 4660 | case CCValAssign::BCvt: | |||
| 4661 | Arg = DAG.getBitcast(RegVT, Arg); | |||
| 4662 | break; | |||
| 4663 | case CCValAssign::Indirect: { | |||
| 4664 | if (isByVal) { | |||
| 4665 | // Memcpy the argument to a temporary stack slot to prevent | |||
| 4666 | // the caller from seeing any modifications the callee may make | |||
| 4667 | // as guaranteed by the `byval` attribute. | |||
| 4668 | int FrameIdx = MF.getFrameInfo().CreateStackObject( | |||
| 4669 | Flags.getByValSize(), | |||
| 4670 | std::max(Align(16), Flags.getNonZeroByValAlign()), false); | |||
| 4671 | SDValue StackSlot = | |||
| 4672 | DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout())); | |||
| 4673 | Chain = | |||
| 4674 | CreateCopyOfByValArgument(Arg, StackSlot, Chain, Flags, DAG, dl); | |||
| 4675 | // From now on treat this as a regular pointer | |||
| 4676 | Arg = StackSlot; | |||
| 4677 | isByVal = false; | |||
| 4678 | } else { | |||
| 4679 | // Store the argument. | |||
| 4680 | SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT()); | |||
| 4681 | int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex(); | |||
| 4682 | Chain = DAG.getStore( | |||
| 4683 | Chain, dl, Arg, SpillSlot, | |||
| 4684 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)); | |||
| 4685 | Arg = SpillSlot; | |||
| 4686 | } | |||
| 4687 | break; | |||
| 4688 | } | |||
| 4689 | } | |||
| 4690 | ||||
| 4691 | if (VA.needsCustom()) { | |||
| 4692 | assert(VA.getValVT() == MVT::v64i1 && | |||
| 4693 | "Currently the only custom case is when we split v64i1 to 2 regs"); | |||
| 4694 | // Split v64i1 value into two registers | |||
| 4695 | Passv64i1ArgInRegs(dl, DAG, Arg, RegsToPass, VA, ArgLocs[++I], Subtarget); | |||
| 4696 | } else if (VA.isRegLoc()) { | |||
| 4697 | RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); | |||
| 4698 | const TargetOptions &Options = DAG.getTarget().Options; | |||
| 4699 | if (Options.EmitCallSiteInfo) | |||
| 4700 | CSInfo.emplace_back(VA.getLocReg(), I); | |||
| 4701 | if (isVarArg && IsWin64) { | |||
| 4702 | // The Win64 ABI requires an argument XMM reg to be copied to the | |||
| 4703 | // corresponding shadow reg if the callee is a varargs function. | |||
| 4704 | Register ShadowReg; | |||
| 4705 | switch (VA.getLocReg()) { | |||
| 4706 | case X86::XMM0: ShadowReg = X86::RCX; break; | |||
| 4707 | case X86::XMM1: ShadowReg = X86::RDX; break; | |||
| 4708 | case X86::XMM2: ShadowReg = X86::R8; break; | |||
| 4709 | case X86::XMM3: ShadowReg = X86::R9; break; | |||
| 4710 | } | |||
| 4711 | if (ShadowReg) | |||
| 4712 | RegsToPass.push_back(std::make_pair(ShadowReg, Arg)); | |||
| 4713 | } | |||
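| | // e.g. a double passed as the second argument to a Win64 varargs callee | |||
| | // is copied to both %xmm1 and its shadow register %rdx, so the callee | |||
| | // can spill the GPRs uniformly when materializing its va_list. | |||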
| 4714 | } else if (!IsSibcall && (!isTailCall || isByVal)) { | |||
| 4715 | assert(VA.isMemLoc()); | |||
| 4716 | if (!StackPtr.getNode()) | |||
| 4717 | StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), | |||
| 4718 | getPointerTy(DAG.getDataLayout())); | |||
| 4719 | MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, | |||
| 4720 | dl, DAG, VA, Flags, isByVal)); | |||
| 4721 | } | |||
| 4722 | } | |||
| 4723 | ||||
| 4724 | if (!MemOpChains.empty()) | |||
| 4725 | Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains); | |||
| 4726 | ||||
| 4727 | if (Subtarget.isPICStyleGOT()) { | |||
| 4728 | // ELF / PIC requires the GOT pointer to be in the EBX register before | |||
| 4729 | // function calls made via the PLT (except for regcall). | |||
| 4730 | if (!isTailCall) { | |||
| 4731 | // An indirect call with the RegCall calling convention may use up all the | |||
| 4732 | // general registers, so it is not suitable to bind the EBX register to the | |||
| 4733 | // GOT address; just let the register allocator handle it. | |||
| 4734 | if (CallConv != CallingConv::X86_RegCall) | |||
| 4735 | RegsToPass.push_back(std::make_pair( | |||
| 4736 | Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), | |||
| 4737 | getPointerTy(DAG.getDataLayout())))); | |||
| 4738 | } else { | |||
| 4739 | // If we are tail calling and generating PIC/GOT style code load the | |||
| 4740 | // address of the callee into ECX. The value in ecx is used as target of | |||
| 4741 | // the tail jump. This is done to circumvent the ebx/callee-saved problem | |||
| 4742 | // for tail calls on PIC/GOT architectures. Normally we would just put the | |||
| 4743 | // address of GOT into ebx and then call target@PLT. But for tail calls | |||
| 4744 | // ebx would be restored (since ebx is callee saved) before jumping to the | |||
| 4745 | // target@PLT. | |||
| 4746 | ||||
| 4747 | // Note: The actual moving to ECX is done further down. | |||
| 4748 | GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); | |||
| 4749 | if (G && !G->getGlobal()->hasLocalLinkage() && | |||
| 4750 | G->getGlobal()->hasDefaultVisibility()) | |||
| 4751 | Callee = LowerGlobalAddress(Callee, DAG); | |||
| 4752 | else if (isa<ExternalSymbolSDNode>(Callee)) | |||
| 4753 | Callee = LowerExternalSymbol(Callee, DAG); | |||
| 4754 | } | |||
| 4755 | } | |||
| 4756 | ||||
| 4757 | if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail && | |||
| 4758 | (Subtarget.hasSSE1() || !M->getModuleFlag("SkipRaxSetup"))) { | |||
| 4759 | // From AMD64 ABI document: | |||
| 4760 | // For calls that may call functions that use varargs or stdargs | |||
| 4761 | // (prototype-less calls or calls to functions containing ellipsis (...) in | |||
| 4762 | // the declaration), %al is used as a hidden argument to specify the number | |||
| 4763 | // of SSE registers used. The contents of %al do not need to match exactly | |||
| 4764 | // the number of registers, but must be an upper bound on the number of SSE | |||
| 4765 | // registers used, and must be in the range 0 to 8 inclusive. | |||
| 4766 | ||||
| 4767 | // Count the number of XMM registers allocated. | |||
| 4768 | static const MCPhysReg XMMArgRegs[] = { | |||
| 4769 | X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3, | |||
| 4770 | X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7 | |||
| 4771 | }; | |||
| 4772 | unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs); | |||
| 4773 | assert((Subtarget.hasSSE1() || !NumXMMRegs) && | |||
| 4774 | "SSE registers cannot be used when SSE is disabled"); | |||
| 4775 | RegsToPass.push_back(std::make_pair(Register(X86::AL), | |||
| 4776 | DAG.getConstant(NumXMMRegs, dl, | |||
| 4777 | MVT::i8))); | |||
| 4778 | } | |||
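| | // e.g. for printf("%f %f\n", a, b) two XMM registers carry arguments, | |||
| | // so NumXMMRegs == 2 and a "movb $2, %al" is emitted before the call. | |||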
| 4779 | ||||
| 4780 | if (isVarArg && IsMustTail) { | |||
| 4781 | const auto &Forwards = X86Info->getForwardedMustTailRegParms(); | |||
| 4782 | for (const auto &F : Forwards) { | |||
| 4783 | SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT); | |||
| 4784 | RegsToPass.push_back(std::make_pair(F.PReg, Val)); | |||
| 4785 | } | |||
| 4786 | } | |||
| 4787 | ||||
| 4788 | // For tail calls, lower the arguments to the 'real' stack slots. Sibcalls | |||
| 4789 | // don't need this because the eligibility check rejects calls that require | |||
| 4790 | // shuffling arguments passed in memory. | |||
| 4791 | if (!IsSibcall && isTailCall) { | |||
| 4792 | // Force all the incoming stack arguments to be loaded from the stack | |||
| 4793 | // before any new outgoing arguments are stored to the stack, because the | |||
| 4794 | // outgoing stack slots may alias the incoming argument stack slots, and | |||
| 4795 | // the alias isn't otherwise explicit. This is slightly more conservative | |||
| 4796 | // than necessary, because it means that each store effectively depends | |||
| 4797 | // on every argument instead of just those arguments it would clobber. | |||
| 4798 | SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain); | |||
| 4799 | ||||
| 4800 | SmallVector<SDValue, 8> MemOpChains2; | |||
| 4801 | SDValue FIN; | |||
| 4802 | int FI = 0; | |||
| 4803 | for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E; | |||
| 4804 | ++I, ++OutsIndex) { | |||
| 4805 | CCValAssign &VA = ArgLocs[I]; | |||
| 4806 | ||||
| 4807 | if (VA.isRegLoc()) { | |||
| 4808 | if (VA.needsCustom()) { | |||
| 4809 | assert((CallConv == CallingConv::X86_RegCall) && | |||
| 4810 | "Expecting custom case only in regcall calling convention"); | |||
| 4811 | // This means that we are in the special case where one argument was | |||
| 4812 | // passed through two register locations; skip the next location. | |||
| 4813 | ++I; | |||
| 4814 | } | |||
| 4815 | ||||
| 4816 | continue; | |||
| 4817 | } | |||
| 4818 | ||||
| 4819 | assert(VA.isMemLoc()); | |||
| 4820 | SDValue Arg = OutVals[OutsIndex]; | |||
| 4821 | ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags; | |||
| 4822 | // Skip inalloca/preallocated arguments. They don't require any work. | |||
| 4823 | if (Flags.isInAlloca() || Flags.isPreallocated()) | |||
| 4824 | continue; | |||
| 4825 | // Create frame index. | |||
| 4826 | int32_t Offset = VA.getLocMemOffset()+FPDiff; | |||
| 4827 | uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8; | |||
| 4828 | FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true); | |||
| 4829 | FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout())); | |||
| 4830 | ||||
| 4831 | if (Flags.isByVal()) { | |||
| 4832 | // Copy relative to the frame pointer. | |||
| 4833 | SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl); | |||
| 4834 | if (!StackPtr.getNode()) | |||
| 4835 | StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), | |||
| 4836 | getPointerTy(DAG.getDataLayout())); | |||
| 4837 | Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()), | |||
| 4838 | StackPtr, Source); | |||
| 4839 | ||||
| 4840 | MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN, | |||
| 4841 | ArgChain, | |||
| 4842 | Flags, DAG, dl)); | |||
| 4843 | } else { | |||
| 4844 | // Store relative to the frame pointer. | |||
| 4845 | MemOpChains2.push_back(DAG.getStore( | |||
| 4846 | ArgChain, dl, Arg, FIN, | |||
| 4847 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI))); | |||
| 4848 | } | |||
| 4849 | } | |||
| 4850 | ||||
| 4851 | if (!MemOpChains2.empty()) | |||
| 4852 | Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2); | |||
| 4853 | ||||
| 4854 | // Store the return address to the appropriate stack slot. | |||
| 4855 | Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, | |||
| 4856 | getPointerTy(DAG.getDataLayout()), | |||
| 4857 | RegInfo->getSlotSize(), FPDiff, dl); | |||
| 4858 | } | |||
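| | // Example of the aliasing hazard handled above (illustration): a caller | |||
| | // whose incoming slots hold (a, b) tail-calling f(b, a) reuses the same | |||
| | // two slots; both values must be loaded before either store, or the | |||
| | // first store would clobber an operand of the second. | |||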
| 4859 | ||||
| 4860 | // Build a sequence of copy-to-reg nodes chained together with token chain | |||
| 4861 | // and glue operands which copy the outgoing args into registers. | |||
| 4862 | SDValue InGlue; | |||
| 4863 | for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { | |||
| 4864 | Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first, | |||
| 4865 | RegsToPass[i].second, InGlue); | |||
| 4866 | InGlue = Chain.getValue(1); | |||
| 4867 | } | |||
| 4868 | ||||
| 4869 | if (DAG.getTarget().getCodeModel() == CodeModel::Large) { | |||
| 4870 | assert(Is64Bit && "Large code model is only legal in 64-bit mode."); | |||
| 4871 | // In the 64-bit large code model, we have to make all calls | |||
| 4872 | // through a register, since the call instruction's 32-bit | |||
| 4873 | // pc-relative offset may not be large enough to hold the whole | |||
| 4874 | // address. | |||
| 4875 | } else if (Callee->getOpcode() == ISD::GlobalAddress || | |||
| 4876 | Callee->getOpcode() == ISD::ExternalSymbol) { | |||
| 4877 | // Lower direct calls to global addresses and external symbols. Setting | |||
| 4878 | // ForCall to true here has the effect of removing WrapperRIP when possible | |||
| 4879 | // to allow direct calls to be selected without first materializing the | |||
| 4880 | // address into a register. | |||
| 4881 | Callee = LowerGlobalOrExternal(Callee, DAG, /*ForCall=*/true); | |||
| 4882 | } else if (Subtarget.isTarget64BitILP32() && | |||
| 4883 | Callee.getValueType() == MVT::i32) { | |||
| 4884 | // Zero-extend the 32-bit Callee address to 64 bits according to the x32 ABI. | |||
| 4885 | Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee); | |||
| 4886 | } | |||
| 4887 | ||||
| 4888 | // Returns a chain & a glue for retval copy to use. | |||
| 4889 | SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); | |||
| 4890 | SmallVector<SDValue, 8> Ops; | |||
| 4891 | ||||
| 4892 | if (!IsSibcall && isTailCall && !IsMustTail) { | |||
| 4893 | Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, 0, InGlue, dl); | |||
| 4894 | InGlue = Chain.getValue(1); | |||
| 4895 | } | |||
| 4896 | ||||
| 4897 | Ops.push_back(Chain); | |||
| 4898 | Ops.push_back(Callee); | |||
| 4899 | ||||
| 4900 | if (isTailCall) | |||
| 4901 | Ops.push_back(DAG.getTargetConstant(FPDiff, dl, MVT::i32)); | |||
| 4902 | ||||
| 4903 | // Add argument registers to the end of the list so that they are known live | |||
| 4904 | // into the call. | |||
| 4905 | for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) | |||
| 4906 | Ops.push_back(DAG.getRegister(RegsToPass[i].first, | |||
| 4907 | RegsToPass[i].second.getValueType())); | |||
| 4908 | ||||
| 4909 | // Add a register mask operand representing the call-preserved registers. | |||
| 4910 | const uint32_t *Mask = [&]() { | |||
| 4911 | auto AdaptedCC = CallConv; | |||
| 4912 | // If HasNCSR is asserted (attribute NoCallerSavedRegisters exists), | |||
| 4913 | // use X86_INTR calling convention because it has the same CSR mask | |||
| 4914 | // (same preserved registers). | |||
| 4915 | if (HasNCSR) | |||
| 4916 | AdaptedCC = (CallingConv::ID)CallingConv::X86_INTR; | |||
| 4917 | // If NoCalleeSavedRegisters is requested, then use GHC since it happens | |||
| 4918 | // to use the CSR_NoRegs_RegMask. | |||
| 4919 | if (CB && CB->hasFnAttr("no_callee_saved_registers")) | |||
| 4920 | AdaptedCC = (CallingConv::ID)CallingConv::GHC; | |||
| 4921 | return RegInfo->getCallPreservedMask(MF, AdaptedCC); | |||
| 4922 | }(); | |||
| 4923 | assert(Mask && "Missing call preserved mask for calling convention"); | |||
| 4924 | ||||
| 4925 | // If this is an invoke in a 32-bit function using a funclet-based | |||
| 4926 | // personality, assume the function clobbers all registers. If an exception | |||
| 4927 | // is thrown, the runtime will not restore CSRs. | |||
| 4928 | // FIXME: Model this more precisely so that we can register allocate across | |||
| 4929 | // the normal edge and spill and fill across the exceptional edge. | |||
| 4930 | if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) { | |||
| 4931 | const Function &CallerFn = MF.getFunction(); | |||
| 4932 | EHPersonality Pers = | |||
| 4933 | CallerFn.hasPersonalityFn() | |||
| 4934 | ? classifyEHPersonality(CallerFn.getPersonalityFn()) | |||
| 4935 | : EHPersonality::Unknown; | |||
| 4936 | if (isFuncletEHPersonality(Pers)) | |||
| 4937 | Mask = RegInfo->getNoPreservedMask(); | |||
| 4938 | } | |||
| 4939 | ||||
| 4940 | // Define a new register mask from the existing mask. | |||
| 4941 | uint32_t *RegMask = nullptr; | |||
| 4942 | ||||
| 4943 | // In some calling conventions we need to remove the used physical registers | |||
| 4944 | // from the reg mask. Create a new RegMask for such calling conventions. | |||
| 4945 | // RegMask for calling conventions that disable only return registers (e.g. | |||
| 4946 | // preserve_most) will be modified later in LowerCallResult. | |||
| 4947 | bool ShouldDisableArgRegs = shouldDisableArgRegFromCSR(CallConv) || HasNCSR; | |||
| 4948 | if (ShouldDisableArgRegs || shouldDisableRetRegFromCSR(CallConv)) { | |||
| 4949 | const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); | |||
| 4950 | ||||
| 4951 | // Allocate a new Reg Mask and copy Mask. | |||
| 4952 | RegMask = MF.allocateRegMask(); | |||
| 4953 | unsigned RegMaskSize = MachineOperand::getRegMaskSize(TRI->getNumRegs()); | |||
| 4954 | memcpy(RegMask, Mask, sizeof(RegMask[0]) * RegMaskSize); | |||
| 4955 | ||||
| 4956 | // Make sure all sub registers of the argument registers are reset | |||
| 4957 | // in the RegMask. | |||
| 4958 | if (ShouldDisableArgRegs) { | |||
| 4959 | for (auto const &RegPair : RegsToPass) | |||
| 4960 | for (MCPhysReg SubReg : TRI->subregs_inclusive(RegPair.first)) | |||
| 4961 | RegMask[SubReg / 32] &= ~(1u << (SubReg % 32)); | |||
| 4962 | } | |||
| 4963 | ||||
| 4964 | // Create the RegMask Operand according to our updated mask. | |||
| 4965 | Ops.push_back(DAG.getRegisterMask(RegMask)); | |||
| 4966 | } else { | |||
| 4967 | // Create the RegMask Operand according to the static mask. | |||
| 4968 | Ops.push_back(DAG.getRegisterMask(Mask)); | |||
| 4969 | } | |||
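| | // In these masks a set bit means "preserved across the call", so | |||
| | // clearing the bits of every argument register (and all of its | |||
| | // sub-registers, e.g. %edi/%di/%dil under %rdi) marks them as clobbered | |||
| | // at this call site. | |||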
| 4970 | ||||
| 4971 | if (InGlue.getNode()) | |||
| 4972 | Ops.push_back(InGlue); | |||
| 4973 | ||||
| 4974 | if (isTailCall) { | |||
| 4975 | // We used to do: | |||
| 4976 | //// If this is the first return lowered for this function, add the regs | |||
| 4977 | //// to the liveout set for the function. | |||
| 4978 | // This isn't right, although it's probably harmless on x86; liveouts | |||
| 4979 | // should be computed from returns not tail calls. Consider a void | |||
| 4980 | // function making a tail call to a function returning int. | |||
| 4981 | MF.getFrameInfo().setHasTailCall(); | |||
| 4982 | SDValue Ret = DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops); | |||
| 4983 | ||||
| 4984 | if (IsCFICall) | |||
| 4985 | Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue()); | |||
| 4986 | ||||
| 4987 | DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo)); | |||
| 4988 | return Ret; | |||
| 4989 | } | |||
| 4990 | ||||
| 4991 | if (HasNoCfCheck && IsCFProtectionSupported && IsIndirectCall) { | |||
| 4992 | Chain = DAG.getNode(X86ISD::NT_CALL, dl, NodeTys, Ops); | |||
| 4993 | } else if (CLI.CB && objcarc::hasAttachedCallOpBundle(CLI.CB)) { | |||
| 4994 | // Calls with a "clang.arc.attachedcall" bundle are special. They should be | |||
| 4995 | // expanded to the call, directly followed by a special marker sequence and | |||
| 4996 | // a call to an ObjC library function. Use CALL_RVMARKER to do that. | |||
| 4997 | assert(!isTailCall &&(static_cast <bool> (!isTailCall && "tail calls cannot be marked with clang.arc.attachedcall" ) ? void (0) : __assert_fail ("!isTailCall && \"tail calls cannot be marked with clang.arc.attachedcall\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 4998, __extension__ __PRETTY_FUNCTION__)) | |||
| 4998 | "tail calls cannot be marked with clang.arc.attachedcall")(static_cast <bool> (!isTailCall && "tail calls cannot be marked with clang.arc.attachedcall" ) ? void (0) : __assert_fail ("!isTailCall && \"tail calls cannot be marked with clang.arc.attachedcall\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 4998, __extension__ __PRETTY_FUNCTION__)); | |||
| 4999 | assert(Is64Bit && "clang.arc.attachedcall is only supported in 64bit mode"); | |||
| 5000 | ||||
| 5001 | // Add a target global address for the retainRV/claimRV runtime function | |||
| 5002 | // just before the call target. | |||
| 5003 | Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB); | |||
| 5004 | auto PtrVT = getPointerTy(DAG.getDataLayout()); | |||
| 5005 | auto GA = DAG.getTargetGlobalAddress(ARCFn, dl, PtrVT); | |||
| 5006 | Ops.insert(Ops.begin() + 1, GA); | |||
| 5007 | Chain = DAG.getNode(X86ISD::CALL_RVMARKER, dl, NodeTys, Ops); | |||
| 5008 | } else { | |||
| 5009 | Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops); | |||
| 5010 | } | |||
| 5011 | ||||
| 5012 | if (IsCFICall) | |||
| 5013 | Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue()); | |||
| 5014 | ||||
| 5015 | InGlue = Chain.getValue(1); | |||
| 5016 | DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge); | |||
| 5017 | DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo)); | |||
| 5018 | ||||
| 5019 | // Save heapallocsite metadata. | |||
| 5020 | if (CLI.CB) | |||
| 5021 | if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite")) | |||
| 5022 | DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc); | |||
| 5023 | ||||
| 5024 | // Create the CALLSEQ_END node. | |||
| 5025 | unsigned NumBytesForCalleeToPop = 0; // Callee pops nothing. | |||
| 5026 | if (X86::isCalleePop(CallConv, Is64Bit, isVarArg, | |||
| 5027 | DAG.getTarget().Options.GuaranteedTailCallOpt)) | |||
| 5028 | NumBytesForCalleeToPop = NumBytes; // Callee pops everything | |||
| 5029 | else if (!canGuaranteeTCO(CallConv) && IsCalleePopSRet) | |||
| 5030 | // If this call passes a struct-return pointer, the callee | |||
| 5031 | // pops that struct pointer. | |||
| 5032 | NumBytesForCalleeToPop = 4; | |||
| 5033 | ||||
| 5034 | // Returns a glue for retval copy to use. | |||
| 5035 | if (!IsSibcall) { | |||
| 5036 | Chain = DAG.getCALLSEQ_END(Chain, NumBytesToPop, NumBytesForCalleeToPop, | |||
| 5037 | InGlue, dl); | |||
| 5038 | InGlue = Chain.getValue(1); | |||
| 5039 | } | |||
| 5040 | ||||
| 5041 | // Handle result values, copying them out of physregs into vregs that we | |||
| 5042 | // return. | |||
| 5043 | return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG, | |||
| 5044 | InVals, RegMask); | |||
| 5045 | } | |||
| 5046 | ||||
| 5047 | //===----------------------------------------------------------------------===// | |||
| 5048 | // Fast Calling Convention (tail call) implementation | |||
| 5049 | //===----------------------------------------------------------------------===// | |||
| 5050 | ||||
| 5051 | // Like stdcall, the callee cleans up the arguments, except that ECX is | |||
| 5052 | // reserved for storing the address of the tail-called function. Only 2 | |||
| 5053 | // registers are free for argument passing (inreg). Tail call optimization | |||
| 5054 | // is performed provided: | |||
| 5055 | // * tailcallopt is enabled | |||
| 5056 | // * caller/callee are fastcc | |||
| 5057 | // On the X86_64 architecture with GOT-style position independent code, only | |||
| 5058 | // local (within-module) calls are supported at the moment. | |||
| 5059 | // To keep the stack aligned according to the platform ABI, the function | |||
| 5060 | // GetAlignedArgumentStackSize ensures that the argument delta is always a | |||
| 5061 | // multiple of the stack alignment. (Dynamic linkers need this - Darwin's | |||
| 5062 | // dyld, for example.) If a tail-called function (the callee) has more | |||
| 5063 | // arguments than the caller, the caller needs to make sure that there is room | |||
| 5064 | // to move the RETADDR to. This is achieved by reserving an area the size of | |||
| 5065 | // the argument delta right after the original RETADDR, but before the saved | |||
| 5066 | // frame pointer or the spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4): | |||
| 5067 | // stack layout: | |||
| 5068 | // arg1 | |||
| 5069 | // arg2 | |||
| 5070 | // RETADDR | |||
| 5071 | // [ new RETADDR | |||
| 5072 | // move area ] | |||
| 5073 | // (possible EBP) | |||
| 5074 | // ESI | |||
| 5075 | // EDI | |||
| 5076 | // local1 .. | |||
| 5077 | ||||
| 5078 | /// Align the argument stack size, e.g. to 16n + 12 for a 16-byte alignment | |||
| 5079 | /// requirement, so that pushing the return address keeps the stack aligned. | |||
| 5080 | unsigned | |||
| 5081 | X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize, | |||
| 5082 | SelectionDAG &DAG) const { | |||
| 5083 | const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign(); | |||
| 5084 | const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize(); | |||
| 5085 | assert(StackSize % SlotSize == 0 && | |||
| 5086 | "StackSize must be a multiple of SlotSize"); | |||
| 5087 | return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize; | |||
| 5088 | } | |||
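| | // Worked example (illustrative, not part of the original source): with a | |||
| | // 16-byte StackAlignment and 4-byte slots, StackSize == 20 yields | |||
| | //   alignTo(20 + 4, 16) - 4 == 28   // i.e. of the form 16n + 12 | |||
| | // so the 28-byte argument area plus the 4-byte RETADDR slot is again a | |||
| | // multiple of 16 at the call, as the comment above describes. | |||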
| 5089 | ||||
| 5090 | /// Return true if the given stack call argument is already available in the | |||
| 5091 | /// same (relative) position in the caller's incoming argument stack. | |||
| 5092 | static | |||
| 5093 | bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, | |||
| 5094 | MachineFrameInfo &MFI, const MachineRegisterInfo *MRI, | |||
| 5095 | const X86InstrInfo *TII, const CCValAssign &VA) { | |||
| 5096 | unsigned Bytes = Arg.getValueSizeInBits() / 8; | |||
| 5097 | ||||
| 5098 | for (;;) { | |||
| 5099 | // Look through nodes that don't alter the bits of the incoming value. | |||
| 5100 | unsigned Op = Arg.getOpcode(); | |||
| 5101 | if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) { | |||
| 5102 | Arg = Arg.getOperand(0); | |||
| 5103 | continue; | |||
| 5104 | } | |||
| 5105 | if (Op == ISD::TRUNCATE) { | |||
| 5106 | const SDValue &TruncInput = Arg.getOperand(0); | |||
| 5107 | if (TruncInput.getOpcode() == ISD::AssertZext && | |||
| 5108 | cast<VTSDNode>(TruncInput.getOperand(1))->getVT() == | |||
| 5109 | Arg.getValueType()) { | |||
| 5110 | Arg = TruncInput.getOperand(0); | |||
| 5111 | continue; | |||
| 5112 | } | |||
| 5113 | } | |||
| 5114 | break; | |||
| 5115 | } | |||
| 5116 | ||||
| 5117 | int FI = INT_MAX; | |||
| 5118 | if (Arg.getOpcode() == ISD::CopyFromReg) { | |||
| 5119 | Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg(); | |||
| 5120 | if (!VR.isVirtual()) | |||
| 5121 | return false; | |||
| 5122 | MachineInstr *Def = MRI->getVRegDef(VR); | |||
| 5123 | if (!Def) | |||
| 5124 | return false; | |||
| 5125 | if (!Flags.isByVal()) { | |||
| 5126 | if (!TII->isLoadFromStackSlot(*Def, FI)) | |||
| 5127 | return false; | |||
| 5128 | } else { | |||
| 5129 | unsigned Opcode = Def->getOpcode(); | |||
| 5130 | if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r || | |||
| 5131 | Opcode == X86::LEA64_32r) && | |||
| 5132 | Def->getOperand(1).isFI()) { | |||
| 5133 | FI = Def->getOperand(1).getIndex(); | |||
| 5134 | Bytes = Flags.getByValSize(); | |||
| 5135 | } else | |||
| 5136 | return false; | |||
| 5137 | } | |||
| 5138 | } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) { | |||
| 5139 | if (Flags.isByVal()) | |||
| 5140 | // ByVal argument is passed in as a pointer but it's now being | |||
| 5141 | // dereferenced. e.g. | |||
| 5142 | // define @foo(%struct.X* %A) { | |||
| 5143 | // tail call @bar(%struct.X* byval %A) | |||
| 5144 | // } | |||
| 5145 | return false; | |||
| 5146 | SDValue Ptr = Ld->getBasePtr(); | |||
| 5147 | FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr); | |||
| 5148 | if (!FINode) | |||
| 5149 | return false; | |||
| 5150 | FI = FINode->getIndex(); | |||
| 5151 | } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { | |||
| 5152 | FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg); | |||
| 5153 | FI = FINode->getIndex(); | |||
| 5154 | Bytes = Flags.getByValSize(); | |||
| 5155 | } else | |||
| 5156 | return false; | |||
| 5157 | ||||
| 5158 | assert(FI != INT_MAX); | |||
| 5159 | if (!MFI.isFixedObjectIndex(FI)) | |||
| 5160 | return false; | |||
| 5161 | ||||
| 5162 | if (Offset != MFI.getObjectOffset(FI)) | |||
| 5163 | return false; | |||
| 5164 | ||||
| 5165 | // If this is not byval, check that the argument stack object is immutable. | |||
| 5166 | // inalloca and argument copy elision can create mutable argument stack | |||
| 5167 | // objects. Byval objects can be mutated, but a byval call intends to pass the | |||
| 5168 | // mutated memory. | |||
| 5169 | if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI)) | |||
| 5170 | return false; | |||
| 5171 | ||||
| 5172 | if (VA.getLocVT().getFixedSizeInBits() > | |||
| 5173 | Arg.getValueSizeInBits().getFixedValue()) { | |||
| 5174 | // If the argument location is wider than the argument type, check that any | |||
| 5175 | // extension flags match. | |||
| 5176 | if (Flags.isZExt() != MFI.isObjectZExt(FI) || | |||
| 5177 | Flags.isSExt() != MFI.isObjectSExt(FI)) { | |||
| 5178 | return false; | |||
| 5179 | } | |||
| 5180 | } | |||
| 5181 | ||||
| 5182 | return Bytes == MFI.getObjectSize(FI); | |||
| 5183 | } | |||
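| | // Illustrative sketch (assumed, not from the original source) of the case | |||
| | // this predicate accepts on 32-bit targets: stack arguments forwarded | |||
| | // unchanged to a tail call, e.g. | |||
| | //   define i32 @caller(i32 %a, i32 %b) { | |||
| | //     %r = tail call i32 @callee(i32 %a, i32 %b) ; %a/%b already sit at the | |||
| | //     ret i32 %r                                 ; matching incoming offsets | |||
| | //   } | |||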
| 5184 | ||||
| 5185 | /// Check whether the call is eligible for tail call optimization. Targets | |||
| 5186 | /// that want to do tail call optimization should implement this function. | |||
| 5187 | bool X86TargetLowering::IsEligibleForTailCallOptimization( | |||
| 5188 | SDValue Callee, CallingConv::ID CalleeCC, bool IsCalleePopSRet, | |||
| 5189 | bool isVarArg, Type *RetTy, const SmallVectorImpl<ISD::OutputArg> &Outs, | |||
| 5190 | const SmallVectorImpl<SDValue> &OutVals, | |||
| 5191 | const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const { | |||
| 5192 | if (!mayTailCallThisCC(CalleeCC)) | |||
| 5193 | return false; | |||
| 5194 | ||||
| 5195 | // If -tailcallopt is specified, make fastcc functions tail-callable. | |||
| 5196 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 5197 | const Function &CallerF = MF.getFunction(); | |||
| 5198 | ||||
| 5199 | // If the function return type is x86_fp80 and the callee return type is not, | |||
| 5200 | // then the FP_EXTEND of the call result is not a nop. It's not safe to | |||
| 5201 | // perform a tailcall optimization here. | |||
| 5202 | if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) | |||
| 5203 | return false; | |||
| 5204 | ||||
| 5205 | CallingConv::ID CallerCC = CallerF.getCallingConv(); | |||
| 5206 | bool CCMatch = CallerCC == CalleeCC; | |||
| 5207 | bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); | |||
| 5208 | bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); | |||
| 5209 | bool IsGuaranteeTCO = DAG.getTarget().Options.GuaranteedTailCallOpt || | |||
| 5210 | CalleeCC == CallingConv::Tail || CalleeCC == CallingConv::SwiftTail; | |||
| 5211 | ||||
| 5212 | // Win64 functions have extra shadow space for argument homing. Don't do the | |||
| 5213 | // sibcall if the caller and callee have mismatched expectations for this | |||
| 5214 | // space. | |||
| 5215 | if (IsCalleeWin64 != IsCallerWin64) | |||
| 5216 | return false; | |||
| 5217 | ||||
| 5218 | if (IsGuaranteeTCO) { | |||
| 5219 | if (canGuaranteeTCO(CalleeCC) && CCMatch) | |||
| 5220 | return true; | |||
| 5221 | return false; | |||
| 5222 | } | |||
| 5223 | ||||
| 5224 | // Look for obvious safe cases to perform tail call optimization that do not | |||
| 5225 | // require ABI changes. This is what gcc calls sibcall. | |||
| 5226 | ||||
| 5227 | // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to | |||
| 5228 | // emit a special epilogue. | |||
| 5229 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); | |||
| 5230 | if (RegInfo->hasStackRealignment(MF)) | |||
| 5231 | return false; | |||
| 5232 | ||||
| 5233 | // Also avoid sibcall optimization if we're an sret-returning function and the | |||
| 5234 | // callee is incompatible. See the comment in LowerReturn about why | |||
| 5235 | // hasStructRetAttr is insufficient. | |||
| 5236 | if (MF.getInfo<X86MachineFunctionInfo>()->getSRetReturnReg()) { | |||
| 5237 | // For a compatible tail call the callee must return our sret pointer. So it | |||
| 5238 | // needs to be (a) an sret function itself and (b) we pass our sret as its | |||
| 5239 | // sret. Condition #b is harder to determine. | |||
| 5240 | return false; | |||
| 5241 | } else if (IsCalleePopSRet) | |||
| 5242 | // The callee pops an sret, so we cannot tail-call, as our caller doesn't | |||
| 5243 | // expect that. | |||
| 5244 | return false; | |||
| 5245 | ||||
| 5246 | // Do not sibcall optimize vararg calls unless all arguments are passed via | |||
| 5247 | // registers. | |||
| 5248 | LLVMContext &C = *DAG.getContext(); | |||
| 5249 | if (isVarArg && !Outs.empty()) { | |||
| 5250 | // Optimizing for varargs on Win64 is unlikely to be safe without | |||
| 5251 | // additional testing. | |||
| 5252 | if (IsCalleeWin64 || IsCallerWin64) | |||
| 5253 | return false; | |||
| 5254 | ||||
| 5255 | SmallVector<CCValAssign, 16> ArgLocs; | |||
| 5256 | CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); | |||
| 5257 | ||||
| 5258 | CCInfo.AnalyzeCallOperands(Outs, CC_X86); | |||
| 5259 | for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) | |||
| 5260 | if (!ArgLocs[i].isRegLoc()) | |||
| 5261 | return false; | |||
| 5262 | } | |||
| 5263 | ||||
| 5264 | // If the call result is in ST0 / ST1, it needs to be popped off the x87 | |||
| 5265 | // stack. Therefore, if it's not used by the call it is not safe to optimize | |||
| 5266 | // this into a sibcall. | |||
| 5267 | bool Unused = false; | |||
| 5268 | for (unsigned i = 0, e = Ins.size(); i != e; ++i) { | |||
| 5269 | if (!Ins[i].Used) { | |||
| 5270 | Unused = true; | |||
| 5271 | break; | |||
| 5272 | } | |||
| 5273 | } | |||
| 5274 | if (Unused) { | |||
| 5275 | SmallVector<CCValAssign, 16> RVLocs; | |||
| 5276 | CCState CCInfo(CalleeCC, false, MF, RVLocs, C); | |||
| 5277 | CCInfo.AnalyzeCallResult(Ins, RetCC_X86); | |||
| 5278 | for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { | |||
| 5279 | CCValAssign &VA = RVLocs[i]; | |||
| 5280 | if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) | |||
| 5281 | return false; | |||
| 5282 | } | |||
| 5283 | } | |||
| 5284 | ||||
| 5285 | // Check that the call results are passed in the same way. | |||
| 5286 | if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins, | |||
| 5287 | RetCC_X86, RetCC_X86)) | |||
| 5288 | return false; | |||
| 5289 | // The callee has to preserve all registers the caller needs to preserve. | |||
| 5290 | const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); | |||
| 5291 | const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC); | |||
| 5292 | if (!CCMatch) { | |||
| 5293 | const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC); | |||
| 5294 | if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) | |||
| 5295 | return false; | |||
| 5296 | } | |||
| 5297 | ||||
| 5298 | unsigned StackArgsSize = 0; | |||
| 5299 | ||||
| 5300 | // If the callee takes no arguments then go on to check the results of the | |||
| 5301 | // call. | |||
| 5302 | if (!Outs.empty()) { | |||
| 5303 | // Check if stack adjustment is needed. For now, do not do this if any | |||
| 5304 | // argument is passed on the stack. | |||
| 5305 | SmallVector<CCValAssign, 16> ArgLocs; | |||
| 5306 | CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C); | |||
| 5307 | ||||
| 5308 | // Allocate shadow area for Win64 | |||
| 5309 | if (IsCalleeWin64) | |||
| 5310 | CCInfo.AllocateStack(32, Align(8)); | |||
| 5311 | ||||
| 5312 | CCInfo.AnalyzeCallOperands(Outs, CC_X86); | |||
| 5313 | StackArgsSize = CCInfo.getNextStackOffset(); | |||
| 5314 | ||||
| 5315 | if (CCInfo.getNextStackOffset()) { | |||
| 5316 | // Check if the arguments are already laid out in the right way as | |||
| 5317 | // the caller's fixed stack objects. | |||
| 5318 | MachineFrameInfo &MFI = MF.getFrameInfo(); | |||
| 5319 | const MachineRegisterInfo *MRI = &MF.getRegInfo(); | |||
| 5320 | const X86InstrInfo *TII = Subtarget.getInstrInfo(); | |||
| 5321 | for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { | |||
| 5322 | CCValAssign &VA = ArgLocs[i]; | |||
| 5323 | SDValue Arg = OutVals[i]; | |||
| 5324 | ISD::ArgFlagsTy Flags = Outs[i].Flags; | |||
| 5325 | if (VA.getLocInfo() == CCValAssign::Indirect) | |||
| 5326 | return false; | |||
| 5327 | if (!VA.isRegLoc()) { | |||
| 5328 | if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, | |||
| 5329 | MFI, MRI, TII, VA)) | |||
| 5330 | return false; | |||
| 5331 | } | |||
| 5332 | } | |||
| 5333 | } | |||
| 5334 | ||||
| 5335 | bool PositionIndependent = isPositionIndependent(); | |||
| 5336 | // If the tailcall address may be in a register, then make sure it's | |||
| 5337 | // possible to register allocate for it. In 32-bit, the call address can | |||
| 5338 | // only target EAX, EDX, or ECX since the tail call must be scheduled after | |||
| 5339 | // callee-saved registers are restored. These happen to be the same | |||
| 5340 | // registers used to pass 'inreg' arguments so watch out for those. | |||
| 5341 | if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) && | |||
| 5342 | !isa<ExternalSymbolSDNode>(Callee)) || | |||
| 5343 | PositionIndependent)) { | |||
| 5344 | unsigned NumInRegs = 0; | |||
| 5345 | // In PIC we need an extra register to formulate the address computation | |||
| 5346 | // for the callee. | |||
| 5347 | unsigned MaxInRegs = PositionIndependent ? 2 : 3; | |||
| 5348 | ||||
| 5349 | for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { | |||
| 5350 | CCValAssign &VA = ArgLocs[i]; | |||
| 5351 | if (!VA.isRegLoc()) | |||
| 5352 | continue; | |||
| 5353 | Register Reg = VA.getLocReg(); | |||
| 5354 | switch (Reg) { | |||
| 5355 | default: break; | |||
| 5356 | case X86::EAX: case X86::EDX: case X86::ECX: | |||
| 5357 | if (++NumInRegs == MaxInRegs) | |||
| 5358 | return false; | |||
| 5359 | break; | |||
| 5360 | } | |||
| 5361 | } | |||
| 5362 | } | |||
| 5363 | ||||
| 5364 | const MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
| 5365 | if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) | |||
| 5366 | return false; | |||
| 5367 | } | |||
| 5368 | ||||
| 5369 | bool CalleeWillPop = | |||
| 5370 | X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg, | |||
| 5371 | MF.getTarget().Options.GuaranteedTailCallOpt); | |||
| 5372 | ||||
| 5373 | if (unsigned BytesToPop = | |||
| 5374 | MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) { | |||
| 5375 | // If we have bytes to pop, the callee must pop them. | |||
| 5376 | bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize; | |||
| 5377 | if (!CalleePopMatches) | |||
| 5378 | return false; | |||
| 5379 | } else if (CalleeWillPop && StackArgsSize > 0) { | |||
| 5380 | // If we don't have bytes to pop, make sure the callee doesn't pop any. | |||
| 5381 | return false; | |||
| 5382 | } | |||
| 5383 | ||||
| 5384 | return true; | |||
| 5385 | } | |||
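| | // Hypothetical C-level examples of the checks above (illustrative only, | |||
| | // assuming a SysV target where long double is x86_fp80): | |||
| | //   int f(int x) { return g(x); }         // typically sibcall-eligible: | |||
| | //                                         // compatible results and stack. | |||
| | //   long double h(void) { return d(); }   // rejected when d() returns double: | |||
| | //                                         // FP_EXTEND of the result isn't a nop. | |||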
| 5386 | ||||
| 5387 | FastISel * | |||
| 5388 | X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, | |||
| 5389 | const TargetLibraryInfo *libInfo) const { | |||
| 5390 | return X86::createFastISel(funcInfo, libInfo); | |||
| 5391 | } | |||
| 5392 | ||||
| 5393 | //===----------------------------------------------------------------------===// | |||
| 5394 | // Other Lowering Hooks | |||
| 5395 | //===----------------------------------------------------------------------===// | |||
| 5396 | ||||
| 5397 | bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget, | |||
| 5398 | bool AssumeSingleUse) { | |||
| 5399 | if (!AssumeSingleUse && !Op.hasOneUse()) | |||
| 5400 | return false; | |||
| 5401 | if (!ISD::isNormalLoad(Op.getNode())) | |||
| 5402 | return false; | |||
| 5403 | ||||
| 5404 | // If this is an unaligned vector, make sure the target supports folding it. | |||
| 5405 | auto *Ld = cast<LoadSDNode>(Op.getNode()); | |||
| 5406 | if (!Subtarget.hasAVX() && !Subtarget.hasSSEUnalignedMem() && | |||
| 5407 | Ld->getValueSizeInBits(0) == 128 && Ld->getAlign() < Align(16)) | |||
| 5408 | return false; | |||
| 5409 | ||||
| 5410 | // TODO: If this is a non-temporal load and the target has an instruction | |||
| 5411 | // for it, it should not be folded. See "useNonTemporalLoad()". | |||
| 5412 | ||||
| 5413 | return true; | |||
| 5414 | } | |||
| 5415 | ||||
| 5416 | bool X86::mayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT, | |||
| 5417 | const X86Subtarget &Subtarget, | |||
| 5418 | bool AssumeSingleUse) { | |||
| 5419 | assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory"); | |||
| 5420 | if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse)) | |||
| 5421 | return false; | |||
| 5422 | ||||
| 5423 | // We cannot replace a wide volatile load with a broadcast-from-memory, | |||
| 5424 | // because that would narrow the load, which isn't legal for volatiles. | |||
| 5425 | auto *Ld = cast<LoadSDNode>(Op.getNode()); | |||
| 5426 | return !Ld->isVolatile() || | |||
| 5427 | Ld->getValueSizeInBits(0) == EltVT.getScalarSizeInBits(); | |||
| 5428 | } | |||
| 5429 | ||||
| 5430 | bool X86::mayFoldIntoStore(SDValue Op) { | |||
| 5431 | return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); | |||
| 5432 | } | |||
| 5433 | ||||
| 5434 | bool X86::mayFoldIntoZeroExtend(SDValue Op) { | |||
| 5435 | if (Op.hasOneUse()) { | |||
| 5436 | unsigned Opcode = Op.getNode()->use_begin()->getOpcode(); | |||
| 5437 | return (ISD::ZERO_EXTEND == Opcode); | |||
| 5438 | } | |||
| 5439 | return false; | |||
| 5440 | } | |||
| 5441 | ||||
| 5442 | static bool isTargetShuffle(unsigned Opcode) { | |||
| 5443 | switch(Opcode) { | |||
| 5444 | default: return false; | |||
| 5445 | case X86ISD::BLENDI: | |||
| 5446 | case X86ISD::PSHUFB: | |||
| 5447 | case X86ISD::PSHUFD: | |||
| 5448 | case X86ISD::PSHUFHW: | |||
| 5449 | case X86ISD::PSHUFLW: | |||
| 5450 | case X86ISD::SHUFP: | |||
| 5451 | case X86ISD::INSERTPS: | |||
| 5452 | case X86ISD::EXTRQI: | |||
| 5453 | case X86ISD::INSERTQI: | |||
| 5454 | case X86ISD::VALIGN: | |||
| 5455 | case X86ISD::PALIGNR: | |||
| 5456 | case X86ISD::VSHLDQ: | |||
| 5457 | case X86ISD::VSRLDQ: | |||
| 5458 | case X86ISD::MOVLHPS: | |||
| 5459 | case X86ISD::MOVHLPS: | |||
| 5460 | case X86ISD::MOVSHDUP: | |||
| 5461 | case X86ISD::MOVSLDUP: | |||
| 5462 | case X86ISD::MOVDDUP: | |||
| 5463 | case X86ISD::MOVSS: | |||
| 5464 | case X86ISD::MOVSD: | |||
| 5465 | case X86ISD::MOVSH: | |||
| 5466 | case X86ISD::UNPCKL: | |||
| 5467 | case X86ISD::UNPCKH: | |||
| 5468 | case X86ISD::VBROADCAST: | |||
| 5469 | case X86ISD::VPERMILPI: | |||
| 5470 | case X86ISD::VPERMILPV: | |||
| 5471 | case X86ISD::VPERM2X128: | |||
| 5472 | case X86ISD::SHUF128: | |||
| 5473 | case X86ISD::VPERMIL2: | |||
| 5474 | case X86ISD::VPERMI: | |||
| 5475 | case X86ISD::VPPERM: | |||
| 5476 | case X86ISD::VPERMV: | |||
| 5477 | case X86ISD::VPERMV3: | |||
| 5478 | case X86ISD::VZEXT_MOVL: | |||
| 5479 | return true; | |||
| 5480 | } | |||
| 5481 | } | |||
| 5482 | ||||
| 5483 | static bool isTargetShuffleVariableMask(unsigned Opcode) { | |||
| 5484 | switch (Opcode) { | |||
| 5485 | default: return false; | |||
| 5486 | // Target Shuffles. | |||
| 5487 | case X86ISD::PSHUFB: | |||
| 5488 | case X86ISD::VPERMILPV: | |||
| 5489 | case X86ISD::VPERMIL2: | |||
| 5490 | case X86ISD::VPPERM: | |||
| 5491 | case X86ISD::VPERMV: | |||
| 5492 | case X86ISD::VPERMV3: | |||
| 5493 | return true; | |||
| 5494 | // 'Faux' Target Shuffles. | |||
| 5495 | case ISD::OR: | |||
| 5496 | case ISD::AND: | |||
| 5497 | case X86ISD::ANDNP: | |||
| 5498 | return true; | |||
| 5499 | } | |||
| 5500 | } | |||
| 5501 | ||||
| 5502 | SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { | |||
| 5503 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 5504 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); | |||
| 5505 | X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); | |||
| 5506 | int ReturnAddrIndex = FuncInfo->getRAIndex(); | |||
| 5507 | ||||
| 5508 | if (ReturnAddrIndex == 0) { | |||
| 5509 | // Set up a frame object for the return address. | |||
| 5510 | unsigned SlotSize = RegInfo->getSlotSize(); | |||
| 5511 | ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize, | |||
| 5512 | -(int64_t)SlotSize, | |||
| 5513 | false); | |||
| 5514 | FuncInfo->setRAIndex(ReturnAddrIndex); | |||
| 5515 | } | |||
| 5516 | ||||
| 5517 | return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout())); | |||
| 5518 | } | |||
| 5519 | ||||
| 5520 | bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, | |||
| 5521 | bool hasSymbolicDisplacement) { | |||
| 5522 | // The offset should fit into a 32-bit immediate field. | |||
| 5523 | if (!isInt<32>(Offset)) | |||
| 5524 | return false; | |||
| 5525 | ||||
| 5526 | // If we don't have a symbolic displacement - we don't have any extra | |||
| 5527 | // restrictions. | |||
| 5528 | if (!hasSymbolicDisplacement) | |||
| 5529 | return true; | |||
| 5530 | ||||
| 5531 | // FIXME: Some tweaks might be needed for medium code model. | |||
| 5532 | if (M != CodeModel::Small && M != CodeModel::Kernel) | |||
| 5533 | return false; | |||
| 5534 | ||||
| 5535 | // For the small code model we assume that the last object ends at least 16MB | |||
| 5536 | // before the 31-bit boundary. We may also accept pretty large negative | |||
| 5537 | // constants, knowing that all objects are in the positive half of the address space. | |||
| 5538 | if (M == CodeModel::Small && Offset < 16*1024*1024) | |||
| 5539 | return true; | |||
| 5540 | ||||
| 5541 | // For the kernel code model we know that all objects reside in the negative | |||
| 5542 | // half of the 32-bit address space. We must not accept negative offsets, since | |||
| 5543 | // they may be just out of range, but we may accept pretty large positive ones. | |||
| 5544 | if (M == CodeModel::Kernel && Offset >= 0) | |||
| 5545 | return true; | |||
| 5546 | ||||
| 5547 | return false; | |||
| 5548 | } | |||
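| | // Worked examples (illustrative), each with a symbolic displacement: | |||
| | //   Offset = 1 << 20,   CodeModel::Small  -> true  (well under the 16MB margin) | |||
| | //   Offset = -1,        CodeModel::Kernel -> false (negative offsets rejected) | |||
| | //   Offset = 1LL << 40, either model      -> false (doesn't fit in 32 bits) | |||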
| 5549 | ||||
| 5550 | /// Determines whether the callee is required to pop its own arguments. | |||
| 5551 | /// Callee pop is necessary to support tail calls. | |||
| 5552 | bool X86::isCalleePop(CallingConv::ID CallingConv, | |||
| 5553 | bool is64Bit, bool IsVarArg, bool GuaranteeTCO) { | |||
| 5554 | // If GuaranteeTCO is true, we force some calls to be callee pop so that we | |||
| 5555 | // can guarantee TCO. | |||
| 5556 | if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO)) | |||
| 5557 | return true; | |||
| 5558 | ||||
| 5559 | switch (CallingConv) { | |||
| 5560 | default: | |||
| 5561 | return false; | |||
| 5562 | case CallingConv::X86_StdCall: | |||
| 5563 | case CallingConv::X86_FastCall: | |||
| 5564 | case CallingConv::X86_ThisCall: | |||
| 5565 | case CallingConv::X86_VectorCall: | |||
| 5566 | return !is64Bit; | |||
| 5567 | } | |||
| 5568 | } | |||
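| | // Illustrative: a 32-bit stdcall callee taking 8 bytes of arguments returns | |||
| | // with 'ret 8', popping its own arguments; in 64-bit mode these conventions | |||
| | // fall back to caller cleanup, hence the !is64Bit result above. | |||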
| 5569 | ||||
| 5570 | /// Return true if the condition is a signed comparison operation. | |||
| 5571 | static bool isX86CCSigned(unsigned X86CC) { | |||
| 5572 | switch (X86CC) { | |||
| 5573 | default: | |||
| 5574 | llvm_unreachable("Invalid integer condition!"); | |||
| 5575 | case X86::COND_E: | |||
| 5576 | case X86::COND_NE: | |||
| 5577 | case X86::COND_B: | |||
| 5578 | case X86::COND_A: | |||
| 5579 | case X86::COND_BE: | |||
| 5580 | case X86::COND_AE: | |||
| 5581 | return false; | |||
| 5582 | case X86::COND_G: | |||
| 5583 | case X86::COND_GE: | |||
| 5584 | case X86::COND_L: | |||
| 5585 | case X86::COND_LE: | |||
| 5586 | return true; | |||
| 5587 | } | |||
| 5588 | } | |||
| 5589 | ||||
| 5590 | static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) { | |||
| 5591 | switch (SetCCOpcode) { | |||
| 5592 | default: llvm_unreachable("Invalid integer condition!"); | |||
| 5593 | case ISD::SETEQ: return X86::COND_E; | |||
| 5594 | case ISD::SETGT: return X86::COND_G; | |||
| 5595 | case ISD::SETGE: return X86::COND_GE; | |||
| 5596 | case ISD::SETLT: return X86::COND_L; | |||
| 5597 | case ISD::SETLE: return X86::COND_LE; | |||
| 5598 | case ISD::SETNE: return X86::COND_NE; | |||
| 5599 | case ISD::SETULT: return X86::COND_B; | |||
| 5600 | case ISD::SETUGT: return X86::COND_A; | |||
| 5601 | case ISD::SETULE: return X86::COND_BE; | |||
| 5602 | case ISD::SETUGE: return X86::COND_AE; | |||
| 5603 | } | |||
| 5604 | } | |||
| 5605 | ||||
| 5606 | /// Do a one-to-one translation of an ISD::CondCode to the X86-specific | |||
| 5607 | /// condition code, returning the condition code and the LHS/RHS of the | |||
| 5608 | /// comparison to make. | |||
| 5609 | static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL, | |||
| 5610 | bool isFP, SDValue &LHS, SDValue &RHS, | |||
| 5611 | SelectionDAG &DAG) { | |||
| 5612 | if (!isFP) { | |||
| 5613 | if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) { | |||
| 5614 | if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnes()) { | |||
| 5615 | // X > -1 -> X == 0, jump !sign. | |||
| 5616 | RHS = DAG.getConstant(0, DL, RHS.getValueType()); | |||
| 5617 | return X86::COND_NS; | |||
| 5618 | } | |||
| 5619 | if (SetCCOpcode == ISD::SETLT && RHSC->isZero()) { | |||
| 5620 | // X < 0 -> X == 0, jump on sign. | |||
| 5621 | return X86::COND_S; | |||
| 5622 | } | |||
| 5623 | if (SetCCOpcode == ISD::SETGE && RHSC->isZero()) { | |||
| 5624 | // X >= 0 -> X == 0, jump on !sign. | |||
| 5625 | return X86::COND_NS; | |||
| 5626 | } | |||
| 5627 | if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) { | |||
| 5628 | // X < 1 -> X <= 0 | |||
| 5629 | RHS = DAG.getConstant(0, DL, RHS.getValueType()); | |||
| 5630 | return X86::COND_LE; | |||
| 5631 | } | |||
| 5632 | } | |||
| 5633 | ||||
| 5634 | return TranslateIntegerX86CC(SetCCOpcode); | |||
| 5635 | } | |||
| 5636 | ||||
| 5637 | // First determine if it is required or is profitable to flip the operands. | |||
| 5638 | ||||
| 5639 | // If LHS is a foldable load, but RHS is not, flip the condition. | |||
| 5640 | if (ISD::isNON_EXTLoad(LHS.getNode()) && | |||
| 5641 | !ISD::isNON_EXTLoad(RHS.getNode())) { | |||
| 5642 | SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); | |||
| 5643 | std::swap(LHS, RHS); | |||
| 5644 | } | |||
| 5645 | ||||
| 5646 | switch (SetCCOpcode) { | |||
| 5647 | default: break; | |||
| 5648 | case ISD::SETOLT: | |||
| 5649 | case ISD::SETOLE: | |||
| 5650 | case ISD::SETUGT: | |||
| 5651 | case ISD::SETUGE: | |||
| 5652 | std::swap(LHS, RHS); | |||
| 5653 | break; | |||
| 5654 | } | |||
| 5655 | ||||
| 5656 | // On a floating point condition, the flags are set as follows: | |||
| 5657 | // ZF PF CF op | |||
| 5658 | // 0 | 0 | 0 | X > Y | |||
| 5659 | // 0 | 0 | 1 | X < Y | |||
| 5660 | // 1 | 0 | 0 | X == Y | |||
| 5661 | // 1 | 1 | 1 | unordered | |||
| 5662 | switch (SetCCOpcode) { | |||
| 5663 | default: llvm_unreachable("Condcode should be pre-legalized away"); | |||
| 5664 | case ISD::SETUEQ: | |||
| 5665 | case ISD::SETEQ: return X86::COND_E; | |||
| 5666 | case ISD::SETOLT: // flipped | |||
| 5667 | case ISD::SETOGT: | |||
| 5668 | case ISD::SETGT: return X86::COND_A; | |||
| 5669 | case ISD::SETOLE: // flipped | |||
| 5670 | case ISD::SETOGE: | |||
| 5671 | case ISD::SETGE: return X86::COND_AE; | |||
| 5672 | case ISD::SETUGT: // flipped | |||
| 5673 | case ISD::SETULT: | |||
| 5674 | case ISD::SETLT: return X86::COND_B; | |||
| 5675 | case ISD::SETUGE: // flipped | |||
| 5676 | case ISD::SETULE: | |||
| 5677 | case ISD::SETLE: return X86::COND_BE; | |||
| 5678 | case ISD::SETONE: | |||
| 5679 | case ISD::SETNE: return X86::COND_NE; | |||
| 5680 | case ISD::SETUO: return X86::COND_P; | |||
| 5681 | case ISD::SETO: return X86::COND_NP; | |||
| 5682 | case ISD::SETOEQ: | |||
| 5683 | case ISD::SETUNE: return X86::COND_INVALID; | |||
| 5684 | } | |||
| 5685 | } | |||
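| | // Example of the FP mapping above (illustrative): for 'x > y' (SETOGT) after | |||
| | // a ucomiss-style compare, COND_A tests CF == 0 && ZF == 0, which per the | |||
| | // flags table is exactly the ordered 'X > Y' row; SETOLT instead swaps the | |||
| | // operands first so the same COND_A encoding can be reused. | |||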
| 5686 | ||||
| 5687 | /// Is there a floating point cmov for the specific X86 condition code? | |||
| 5688 | /// The current x86 ISA includes the following FP cmov instructions: | |||
| 5689 | /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. | |||
| 5690 | static bool hasFPCMov(unsigned X86CC) { | |||
| 5691 | switch (X86CC) { | |||
| 5692 | default: | |||
| 5693 | return false; | |||
| 5694 | case X86::COND_B: | |||
| 5695 | case X86::COND_BE: | |||
| 5696 | case X86::COND_E: | |||
| 5697 | case X86::COND_P: | |||
| 5698 | case X86::COND_A: | |||
| 5699 | case X86::COND_AE: | |||
| 5700 | case X86::COND_NE: | |||
| 5701 | case X86::COND_NP: | |||
| 5702 | return true; | |||
| 5703 | } | |||
| 5704 | } | |||
| 5705 | ||||
| 5706 | static bool useVPTERNLOG(const X86Subtarget &Subtarget, MVT VT) { | |||
| 5707 | return Subtarget.hasVLX() || Subtarget.canExtendTo512DQ() || | |||
| 5708 | VT.is512BitVector(); | |||
| 5709 | } | |||
| 5710 | ||||
| 5711 | bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, | |||
| 5712 | const CallInst &I, | |||
| 5713 | MachineFunction &MF, | |||
| 5714 | unsigned Intrinsic) const { | |||
| 5715 | Info.flags = MachineMemOperand::MONone; | |||
| 5716 | Info.offset = 0; | |||
| 5717 | ||||
| 5718 | const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); | |||
| 5719 | if (!IntrData) { | |||
| 5720 | switch (Intrinsic) { | |||
| 5721 | case Intrinsic::x86_aesenc128kl: | |||
| 5722 | case Intrinsic::x86_aesdec128kl: | |||
| 5723 | Info.opc = ISD::INTRINSIC_W_CHAIN; | |||
| 5724 | Info.ptrVal = I.getArgOperand(1); | |||
| 5725 | Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48); | |||
| 5726 | Info.align = Align(1); | |||
| 5727 | Info.flags |= MachineMemOperand::MOLoad; | |||
| 5728 | return true; | |||
| 5729 | case Intrinsic::x86_aesenc256kl: | |||
| 5730 | case Intrinsic::x86_aesdec256kl: | |||
| 5731 | Info.opc = ISD::INTRINSIC_W_CHAIN; | |||
| 5732 | Info.ptrVal = I.getArgOperand(1); | |||
| 5733 | Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64); | |||
| 5734 | Info.align = Align(1); | |||
| 5735 | Info.flags |= MachineMemOperand::MOLoad; | |||
| 5736 | return true; | |||
| 5737 | case Intrinsic::x86_aesencwide128kl: | |||
| 5738 | case Intrinsic::x86_aesdecwide128kl: | |||
| 5739 | Info.opc = ISD::INTRINSIC_W_CHAIN; | |||
| 5740 | Info.ptrVal = I.getArgOperand(0); | |||
| 5741 | Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 48); | |||
| 5742 | Info.align = Align(1); | |||
| 5743 | Info.flags |= MachineMemOperand::MOLoad; | |||
| 5744 | return true; | |||
| 5745 | case Intrinsic::x86_aesencwide256kl: | |||
| 5746 | case Intrinsic::x86_aesdecwide256kl: | |||
| 5747 | Info.opc = ISD::INTRINSIC_W_CHAIN; | |||
| 5748 | Info.ptrVal = I.getArgOperand(0); | |||
| 5749 | Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), 64); | |||
| 5750 | Info.align = Align(1); | |||
| 5751 | Info.flags |= MachineMemOperand::MOLoad; | |||
| 5752 | return true; | |||
| 5753 | case Intrinsic::x86_cmpccxadd32: | |||
| 5754 | case Intrinsic::x86_cmpccxadd64: | |||
| 5755 | case Intrinsic::x86_atomic_bts: | |||
| 5756 | case Intrinsic::x86_atomic_btc: | |||
| 5757 | case Intrinsic::x86_atomic_btr: { | |||
| 5758 | Info.opc = ISD::INTRINSIC_W_CHAIN; | |||
| 5759 | Info.ptrVal = I.getArgOperand(0); | |||
| 5760 | unsigned Size = I.getType()->getScalarSizeInBits(); | |||
| 5761 | Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size); | |||
| 5762 | Info.align = Align(Size); | |||
| 5763 | Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | | |||
| 5764 | MachineMemOperand::MOVolatile; | |||
| 5765 | return true; | |||
| 5766 | } | |||
| 5767 | case Intrinsic::x86_atomic_bts_rm: | |||
| 5768 | case Intrinsic::x86_atomic_btc_rm: | |||
| 5769 | case Intrinsic::x86_atomic_btr_rm: { | |||
| 5770 | Info.opc = ISD::INTRINSIC_W_CHAIN; | |||
| 5771 | Info.ptrVal = I.getArgOperand(0); | |||
| 5772 | unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits(); | |||
| 5773 | Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size); | |||
| 5774 | Info.align = Align(Size); | |||
| 5775 | Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | | |||
| 5776 | MachineMemOperand::MOVolatile; | |||
| 5777 | return true; | |||
| 5778 | } | |||
| 5779 | case Intrinsic::x86_aadd32: | |||
| 5780 | case Intrinsic::x86_aadd64: | |||
| 5781 | case Intrinsic::x86_aand32: | |||
| 5782 | case Intrinsic::x86_aand64: | |||
| 5783 | case Intrinsic::x86_aor32: | |||
| 5784 | case Intrinsic::x86_aor64: | |||
| 5785 | case Intrinsic::x86_axor32: | |||
| 5786 | case Intrinsic::x86_axor64: | |||
| 5787 | case Intrinsic::x86_atomic_add_cc: | |||
| 5788 | case Intrinsic::x86_atomic_sub_cc: | |||
| 5789 | case Intrinsic::x86_atomic_or_cc: | |||
| 5790 | case Intrinsic::x86_atomic_and_cc: | |||
| 5791 | case Intrinsic::x86_atomic_xor_cc: { | |||
| 5792 | Info.opc = ISD::INTRINSIC_W_CHAIN; | |||
| 5793 | Info.ptrVal = I.getArgOperand(0); | |||
| 5794 | unsigned Size = I.getArgOperand(1)->getType()->getScalarSizeInBits(); | |||
| 5795 | Info.memVT = EVT::getIntegerVT(I.getType()->getContext(), Size); | |||
| 5796 | Info.align = Align(Size); | |||
| 5797 | Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore | | |||
| 5798 | MachineMemOperand::MOVolatile; | |||
| 5799 | return true; | |||
| 5800 | } | |||
| 5801 | } | |||
| 5802 | return false; | |||
| 5803 | } | |||
| 5804 | ||||
| 5805 | switch (IntrData->Type) { | |||
| 5806 | case TRUNCATE_TO_MEM_VI8: | |||
| 5807 | case TRUNCATE_TO_MEM_VI16: | |||
| 5808 | case TRUNCATE_TO_MEM_VI32: { | |||
| 5809 | Info.opc = ISD::INTRINSIC_VOID; | |||
| 5810 | Info.ptrVal = I.getArgOperand(0); | |||
| 5811 | MVT VT = MVT::getVT(I.getArgOperand(1)->getType()); | |||
| 5812 | MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE; | |||
| 5813 | if (IntrData->Type == TRUNCATE_TO_MEM_VI8) | |||
| 5814 | ScalarVT = MVT::i8; | |||
| 5815 | else if (IntrData->Type == TRUNCATE_TO_MEM_VI16) | |||
| 5816 | ScalarVT = MVT::i16; | |||
| 5817 | else if (IntrData->Type == TRUNCATE_TO_MEM_VI32) | |||
| 5818 | ScalarVT = MVT::i32; | |||
| 5819 | ||||
| 5820 | Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements()); | |||
| 5821 | Info.align = Align(1); | |||
| 5822 | Info.flags |= MachineMemOperand::MOStore; | |||
| 5823 | break; | |||
| 5824 | } | |||
| 5825 | case GATHER: | |||
| 5826 | case GATHER_AVX2: { | |||
| 5827 | Info.opc = ISD::INTRINSIC_W_CHAIN; | |||
| 5828 | Info.ptrVal = nullptr; | |||
| 5829 | MVT DataVT = MVT::getVT(I.getType()); | |||
| 5830 | MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); | |||
| 5831 | unsigned NumElts = std::min(DataVT.getVectorNumElements(), | |||
| 5832 | IndexVT.getVectorNumElements()); | |||
| 5833 | Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); | |||
| 5834 | Info.align = Align(1); | |||
| 5835 | Info.flags |= MachineMemOperand::MOLoad; | |||
| 5836 | break; | |||
| 5837 | } | |||
| 5838 | case SCATTER: { | |||
| 5839 | Info.opc = ISD::INTRINSIC_VOID; | |||
| 5840 | Info.ptrVal = nullptr; | |||
| 5841 | MVT DataVT = MVT::getVT(I.getArgOperand(3)->getType()); | |||
| 5842 | MVT IndexVT = MVT::getVT(I.getArgOperand(2)->getType()); | |||
| 5843 | unsigned NumElts = std::min(DataVT.getVectorNumElements(), | |||
| 5844 | IndexVT.getVectorNumElements()); | |||
| 5845 | Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts); | |||
| 5846 | Info.align = Align(1); | |||
| 5847 | Info.flags |= MachineMemOperand::MOStore; | |||
| 5848 | break; | |||
| 5849 | } | |||
| 5850 | default: | |||
| 5851 | return false; | |||
| 5852 | } | |||
| 5853 | ||||
| 5854 | return true; | |||
| 5855 | } | |||
| 5856 | ||||
| 5857 | /// Returns true if the target can instruction select the | |||
| 5858 | /// specified FP immediate natively. If false, the legalizer will | |||
| 5859 | /// materialize the FP immediate as a load from a constant pool. | |||
| 5860 | bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT, | |||
| 5861 | bool ForCodeSize) const { | |||
| 5862 | for (const APFloat &FPImm : LegalFPImmediates) | |||
| 5863 | if (Imm.bitwiseIsEqual(FPImm)) | |||
| 5864 | return true; | |||
| 5865 | return false; | |||
| 5866 | } | |||
| 5867 | ||||
| 5868 | bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load, | |||
| 5869 | ISD::LoadExtType ExtTy, | |||
| 5870 | EVT NewVT) const { | |||
| 5871 | assert(cast<LoadSDNode>(Load)->isSimple() && "illegal to narrow"); | |||
| 5872 | ||||
| 5873 | // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF | |||
| 5874 | // relocation target a movq or addq instruction: don't let the load shrink. | |||
| 5875 | SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr(); | |||
| 5876 | if (BasePtr.getOpcode() == X86ISD::WrapperRIP) | |||
| 5877 | if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0))) | |||
| 5878 | return GA->getTargetFlags() != X86II::MO_GOTTPOFF; | |||
| 5879 | ||||
| 5880 | // If this is (1) an AVX vector load with (2) multiple uses and (3) all of | |||
| 5881 | // those uses are extracted directly into a store, then the extract + store | |||
| 5882 | // can be store-folded. Therefore, it's probably not worth splitting the load. | |||
| 5883 | EVT VT = Load->getValueType(0); | |||
| 5884 | if ((VT.is256BitVector() || VT.is512BitVector()) && !Load->hasOneUse()) { | |||
| 5885 | for (auto UI = Load->use_begin(), UE = Load->use_end(); UI != UE; ++UI) { | |||
| 5886 | // Skip uses of the chain value. Result 0 of the node is the load value. | |||
| 5887 | if (UI.getUse().getResNo() != 0) | |||
| 5888 | continue; | |||
| 5889 | ||||
| 5890 | // If this use is not an extract + store, it's probably worth splitting. | |||
| 5891 | if (UI->getOpcode() != ISD::EXTRACT_SUBVECTOR || !UI->hasOneUse() || | |||
| 5892 | UI->use_begin()->getOpcode() != ISD::STORE) | |||
| 5893 | return true; | |||
| 5894 | } | |||
| 5895 | // All non-chain uses are extract + store. | |||
| 5896 | return false; | |||
| 5897 | } | |||
| 5898 | ||||
| 5899 | return true; | |||
| 5900 | } | |||
| 5901 | ||||
| 5902 | /// Returns true if it is beneficial to convert a load of a constant | |||
| 5903 | /// to just the constant itself. | |||
| 5904 | bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, | |||
| 5905 | Type *Ty) const { | |||
| 5906 | assert(Ty->isIntegerTy()); | |||
| 5907 | ||||
| 5908 | unsigned BitSize = Ty->getPrimitiveSizeInBits(); | |||
| 5909 | if (BitSize == 0 || BitSize > 64) | |||
| 5910 | return false; | |||
| 5911 | return true; | |||
| 5912 | } | |||
| 5913 | ||||
| 5914 | bool X86TargetLowering::reduceSelectOfFPConstantLoads(EVT CmpOpVT) const { | |||
| 5915 | // If we are using XMM registers in the ABI and the condition of the select is | |||
| 5916 | // a floating-point compare and we have blendv or conditional move, then it is | |||
| 5917 | // cheaper to select instead of doing a cross-register move and creating a | |||
| 5918 | // load that depends on the compare result. | |||
| 5919 | bool IsFPSetCC = CmpOpVT.isFloatingPoint() && CmpOpVT != MVT::f128; | |||
| 5920 | return !IsFPSetCC || !Subtarget.isTarget64BitLP64() || !Subtarget.hasAVX(); | |||
| 5921 | } | |||
| 5922 | ||||
| 5923 | bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const { | |||
| 5924 | // TODO: It might be a win to ease or lift this restriction, but the generic | |||
| 5925 | // folds in DAGCombiner conflict with vector folds for an AVX512 target. | |||
| 5926 | if (VT.isVector() && Subtarget.hasAVX512()) | |||
| 5927 | return false; | |||
| 5928 | ||||
| 5929 | return true; | |||
| 5930 | } | |||
| 5931 | ||||
| 5932 | bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT, | |||
| 5933 | SDValue C) const { | |||
| 5934 | // TODO: We handle scalars using custom code, but generic combining could make | |||
| 5935 | // that unnecessary. | |||
| 5936 | APInt MulC; | |||
| 5937 | if (!ISD::isConstantSplatVector(C.getNode(), MulC)) | |||
| 5938 | return false; | |||
| 5939 | ||||
| 5940 | // Find the type this will be legalized to. Otherwise we might prematurely | |||
| 5941 | // convert this to shl+add/sub and then still have to type legalize those ops. | |||
| 5942 | // Another choice would be to defer the decision for illegal types until | |||
| 5943 | // after type legalization. But constant splat vectors of i64 can't make it | |||
| 5944 | // through type legalization on 32-bit targets so we would need to special | |||
| 5945 | // case vXi64. | |||
| 5946 | while (getTypeAction(Context, VT) != TypeLegal) | |||
| 5947 | VT = getTypeToTransformTo(Context, VT); | |||
| 5948 | ||||
| 5949 | // If vector multiply is legal, assume that's faster than shl + add/sub: | |||
| 5950 | // multiply is a complex op with higher latency and lower throughput in | |||
| 5951 | // most implementations, but sub-vXi32 vector multiplies are always fast, | |||
| 5952 | // vXi32 is fast unless it has a SlowMULLD implementation, and anything | |||
| 5953 | // larger (vXi64) is always going to be slow. | |||
| 5954 | unsigned EltSizeInBits = VT.getScalarSizeInBits(); | |||
| 5955 | if (isOperationLegal(ISD::MUL, VT) && EltSizeInBits <= 32 && | |||
| 5956 | (EltSizeInBits != 32 || !Subtarget.isPMULLDSlow())) | |||
| 5957 | return false; | |||
| 5958 | ||||
| 5959 | // shl+add, shl+sub, shl+add+neg | |||
| 5960 | return (MulC + 1).isPowerOf2() || (MulC - 1).isPowerOf2() || | |||
| 5961 | (1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2(); | |||
| 5962 | } | |||
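| | // Worked examples (illustrative): MulC == 5 satisfies (MulC - 1).isPowerOf2(), | |||
| | // so x * 5 decomposes to (x << 2) + x; MulC == 7 satisfies | |||
| | // (MulC + 1).isPowerOf2() and becomes (x << 3) - x; MulC == -9 matches | |||
| | // (-(MulC + 1)).isPowerOf2() and uses the shl+add+neg form, -((x << 3) + x). | |||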
| 5963 | ||||
| 5964 | bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, | |||
| 5965 | unsigned Index) const { | |||
| 5966 | if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) | |||
| 5967 | return false; | |||
| 5968 | ||||
| 5969 | // Mask vectors support all subregister combinations and operations that | |||
| 5970 | // extract half of a vector. | |||
| 5971 | if (ResVT.getVectorElementType() == MVT::i1) | |||
| 5972 | return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) && | |||
| 5973 | (Index == ResVT.getVectorNumElements())); | |||
| 5974 | ||||
| 5975 | return (Index % ResVT.getVectorNumElements()) == 0; | |||
| 5976 | } | |||
| 5977 | ||||
| 5978 | bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const { | |||
| 5979 | unsigned Opc = VecOp.getOpcode(); | |||
| 5980 | ||||
| 5981 | // Assume target opcodes can't be scalarized. | |||
| 5982 | // TODO - do we have any exceptions? | |||
| 5983 | if (Opc >= ISD::BUILTIN_OP_END) | |||
| 5984 | return false; | |||
| 5985 | ||||
| 5986 | // If the vector op is not supported, try to convert to scalar. | |||
| 5987 | EVT VecVT = VecOp.getValueType(); | |||
| 5988 | if (!isOperationLegalOrCustomOrPromote(Opc, VecVT)) | |||
| 5989 | return true; | |||
| 5990 | ||||
| 5991 | // If the vector op is supported, but the scalar op is not, the transform may | |||
| 5992 | // not be worthwhile. | |||
| 5993 | EVT ScalarVT = VecVT.getScalarType(); | |||
| 5994 | return isOperationLegalOrCustomOrPromote(Opc, ScalarVT); | |||
| 5995 | } | |||
| 5996 | ||||
| 5997 | bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT, | |||
| 5998 | bool) const { | |||
| 5999 | // TODO: Allow vectors? | |||
| 6000 | if (VT.isVector()) | |||
| 6001 | return false; | |||
| 6002 | return VT.isSimple() || !isOperationExpand(Opcode, VT); | |||
| 6003 | } | |||
| 6004 | ||||
| 6005 | bool X86TargetLowering::isCheapToSpeculateCttz(Type *Ty) const { | |||
| 6006 | // Speculate cttz only if we can directly use TZCNT or can promote to i32. | |||
| 6007 | return Subtarget.hasBMI() || | |||
| 6008 | (!Ty->isVectorTy() && Ty->getScalarSizeInBits() < 32); | |||
| 6009 | } | |||
| 6010 | ||||
| 6011 | bool X86TargetLowering::isCheapToSpeculateCtlz(Type *Ty) const { | |||
| 6012 | // Speculate ctlz only if we can directly use LZCNT. | |||
| 6013 | return Subtarget.hasLZCNT(); | |||
| 6014 | } | |||
| 6015 | ||||
| 6016 | bool X86TargetLowering::ShouldShrinkFPConstant(EVT VT) const { | |||
| 6017 | // Don't shrink FP constpool if SSE2 is available since cvtss2sd is more | |||
| 6018 | // expensive than a straight movsd. On the other hand, it's important to | |||
| 6019 | // shrink long double fp constant since fldt is very slow. | |||
| 6020 | return !Subtarget.hasSSE2() || VT == MVT::f80; | |||
| 6021 | } | |||
| 6022 | ||||
| 6023 | bool X86TargetLowering::isScalarFPTypeInSSEReg(EVT VT) const { | |||
| 6024 | return (VT == MVT::f64 && Subtarget.hasSSE2()) || | |||
| 6025 | (VT == MVT::f32 && Subtarget.hasSSE1()) || VT == MVT::f16; | |||
| 6026 | } | |||
| 6027 | ||||
| 6028 | bool X86TargetLowering::isLoadBitCastBeneficial(EVT LoadVT, EVT BitcastVT, | |||
| 6029 | const SelectionDAG &DAG, | |||
| 6030 | const MachineMemOperand &MMO) const { | |||
| 6031 | if (!Subtarget.hasAVX512() && !LoadVT.isVector() && BitcastVT.isVector() && | |||
| 6032 | BitcastVT.getVectorElementType() == MVT::i1) | |||
| 6033 | return false; | |||
| 6034 | ||||
| 6035 | if (!Subtarget.hasDQI() && BitcastVT == MVT::v8i1 && LoadVT == MVT::i8) | |||
| 6036 | return false; | |||
| 6037 | ||||
| 6038 | // If both types are legal vectors, it's always ok to convert them. | |||
| 6039 | if (LoadVT.isVector() && BitcastVT.isVector() && | |||
| 6040 | isTypeLegal(LoadVT) && isTypeLegal(BitcastVT)) | |||
| 6041 | return true; | |||
| 6042 | ||||
| 6043 | return TargetLowering::isLoadBitCastBeneficial(LoadVT, BitcastVT, DAG, MMO); | |||
| 6044 | } | |||
| 6045 | ||||
| 6046 | bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT, | |||
| 6047 | const MachineFunction &MF) const { | |||
| 6048 | // Do not merge stores up to a float/vector value size (128 bits) if the | |||
| 6049 | // NoImplicitFloat function attribute is set. | |||
| 6050 | bool NoFloat = MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat); | |||
| 6051 | ||||
| 6052 | if (NoFloat) { | |||
| 6053 | unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32; | |||
| 6054 | return (MemVT.getSizeInBits() <= MaxIntSize); | |||
| 6055 | } | |||
| 6056 | // Make sure we don't merge greater than our preferred vector | |||
| 6057 | // width. | |||
| 6058 | if (MemVT.getSizeInBits() > Subtarget.getPreferVectorWidth()) | |||
| 6059 | return false; | |||
| 6060 | ||||
| 6061 | return true; | |||
| 6062 | } | |||
| 6063 | ||||
| 6064 | bool X86TargetLowering::isCtlzFast() const { | |||
| 6065 | return Subtarget.hasFastLZCNT(); | |||
| 6066 | } | |||
| 6067 | ||||
| 6068 | bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial( | |||
| 6069 | const Instruction &AndI) const { | |||
| 6070 | return true; | |||
| 6071 | } | |||
| 6072 | ||||
| 6073 | bool X86TargetLowering::hasAndNotCompare(SDValue Y) const { | |||
| 6074 | EVT VT = Y.getValueType(); | |||
| 6075 | ||||
| 6076 | if (VT.isVector()) | |||
| 6077 | return false; | |||
| 6078 | ||||
| 6079 | if (!Subtarget.hasBMI()) | |||
| 6080 | return false; | |||
| 6081 | ||||
| 6082 | // There are only 32-bit and 64-bit forms for 'andn'. | |||
| 6083 | if (VT != MVT::i32 && VT != MVT::i64) | |||
| 6084 | return false; | |||
| 6085 | ||||
| 6086 | return !isa<ConstantSDNode>(Y); | |||
| 6087 | } | |||
| 6088 | ||||
| 6089 | bool X86TargetLowering::hasAndNot(SDValue Y) const { | |||
| 6090 | EVT VT = Y.getValueType(); | |||
| 6091 | ||||
| 6092 | if (!VT.isVector()) | |||
| 6093 | return hasAndNotCompare(Y); | |||
| 6094 | ||||
| 6095 | // Vector. | |||
| 6096 | ||||
| 6097 | if (!Subtarget.hasSSE1() || VT.getSizeInBits() < 128) | |||
| 6098 | return false; | |||
| 6099 | ||||
| 6100 | if (VT == MVT::v4i32) | |||
| 6101 | return true; | |||
| 6102 | ||||
| 6103 | return Subtarget.hasSSE2(); | |||
| 6104 | } | |||
| 6105 | ||||
| 6106 | bool X86TargetLowering::hasBitTest(SDValue X, SDValue Y) const { | |||
| 6107 | return X.getValueType().isScalarInteger(); // 'bt' | |||
| 6108 | } | |||
| 6109 | ||||
| 6110 | bool X86TargetLowering:: | |||
| 6111 | shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( | |||
| 6112 | SDValue X, ConstantSDNode *XC, ConstantSDNode *CC, SDValue Y, | |||
| 6113 | unsigned OldShiftOpcode, unsigned NewShiftOpcode, | |||
| 6114 | SelectionDAG &DAG) const { | |||
| 6115 | // Does baseline recommend not to perform the fold by default? | |||
| 6116 | if (!TargetLowering::shouldProduceAndByConstByHoistingConstFromShiftsLHSOfAnd( | |||
| 6117 | X, XC, CC, Y, OldShiftOpcode, NewShiftOpcode, DAG)) | |||
| 6118 | return false; | |||
| 6119 | // For scalars this transform is always beneficial. | |||
| 6120 | if (X.getValueType().isScalarInteger()) | |||
| 6121 | return true; | |||
| 6122 | // If all the shift amounts are identical, then the transform is beneficial | |||
| 6123 | // even with rudimentary SSE2 shifts. | |||
| 6124 | if (DAG.isSplatValue(Y, /*AllowUndefs=*/true)) | |||
| 6125 | return true; | |||
| 6126 | // If we have AVX2 with its powerful shift operations, then it's also good. | |||
| 6127 | if (Subtarget.hasAVX2()) | |||
| 6128 | return true; | |||
| 6129 | // Pre-AVX2 vector codegen for this pattern is best for variant with 'shl'. | |||
| 6130 | return NewShiftOpcode == ISD::SHL; | |||
| 6131 | } | |||
| 6132 | ||||
| 6133 | bool X86TargetLowering::preferScalarizeSplat(SDNode *N) const { | |||
| 6134 | return N->getOpcode() != ISD::FP_EXTEND; | |||
| 6135 | } | |||
| 6136 | ||||
| 6137 | bool X86TargetLowering::shouldFoldConstantShiftPairToMask( | |||
| 6138 | const SDNode *N, CombineLevel Level) const { | |||
| 6139 | assert(((N->getOpcode() == ISD::SHL && | |||
| 6140 | N->getOperand(0).getOpcode() == ISD::SRL) || | |||
| 6141 | (N->getOpcode() == ISD::SRL && | |||
| 6142 | N->getOperand(0).getOpcode() == ISD::SHL)) && | |||
| 6143 | "Expected shift-shift mask"); | |||
| 6144 | // TODO: Should we always create i64 masks? Or only folded immediates? | |||
| 6145 | EVT VT = N->getValueType(0); | |||
| 6146 | if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) || | |||
| 6147 | (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) { | |||
| 6148 | // Only fold if the shift values are equal - so it folds to AND. | |||
| 6149 | // TODO - we should fold if either is a non-uniform vector but we don't do | |||
| 6150 | // the fold for non-splats yet. | |||
| 6151 | return N->getOperand(1) == N->getOperand(0).getOperand(1); | |||
| 6152 | } | |||
| 6153 | return TargetLoweringBase::shouldFoldConstantShiftPairToMask(N, Level); | |||
| 6154 | } | |||
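// Worked example of the fold this hook gates: with equal constant shift
// amounts the shift pair collapses to a single mask, e.g. for i32
//   (X >> 4) << 4  ==  X & 0xFFFFFFF0
// so targets reporting fast shift-masks prefer the AND form.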
| 6155 | ||||
| 6156 | bool X86TargetLowering::shouldFoldMaskToVariableShiftPair(SDValue Y) const { | |||
| 6157 | EVT VT = Y.getValueType(); | |||
| 6158 | ||||
| 6159 | // For vectors, we don't have a preference, but we probably want a mask. | |||
| 6160 | if (VT.isVector()) | |||
| 6161 | return false; | |||
| 6162 | ||||
| 6163 | // 64-bit shifts on 32-bit targets produce really bad bloated code. | |||
| 6164 | if (VT == MVT::i64 && !Subtarget.is64Bit()) | |||
| 6165 | return false; | |||
| 6166 | ||||
| 6167 | return true; | |||
| 6168 | } | |||
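// Worked example of the preference expressed above: for a variable amount Y,
//   X & (-1 << Y)  -->  (X >> Y) << Y
// clears the low Y bits without materializing the variable mask; for i64 on
// a 32-bit target each expanded 64-bit shift is a multi-instruction
// sequence, hence the bail-out.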
| 6169 | ||||
| 6170 | TargetLowering::ShiftLegalizationStrategy | |||
| 6171 | X86TargetLowering::preferredShiftLegalizationStrategy( | |||
| 6172 | SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const { | |||
| 6173 | if (DAG.getMachineFunction().getFunction().hasMinSize() && | |||
| 6174 | !Subtarget.isOSWindows()) | |||
| 6175 | return ShiftLegalizationStrategy::LowerToLibcall; | |||
| 6176 | return TargetLowering::preferredShiftLegalizationStrategy(DAG, N, | |||
| 6177 | ExpansionFactor); | |||
| 6178 | } | |||
| 6179 | ||||
| 6180 | bool X86TargetLowering::shouldSplatInsEltVarIndex(EVT VT) const { | |||
| 6181 | // Any legal vector type can be splatted more efficiently than | |||
| 6182 | // loading/spilling from memory. | |||
| 6183 | return isTypeLegal(VT); | |||
| 6184 | } | |||
| 6185 | ||||
| 6186 | MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const { | |||
| 6187 | MVT VT = MVT::getIntegerVT(NumBits); | |||
| 6188 | if (isTypeLegal(VT)) | |||
| 6189 | return VT; | |||
| 6190 | ||||
| 6191 | // PMOVMSKB can handle this. | |||
| 6192 | if (NumBits == 128 && isTypeLegal(MVT::v16i8)) | |||
| 6193 | return MVT::v16i8; | |||
| 6194 | ||||
| 6195 | // VPMOVMSKB can handle this. | |||
| 6196 | if (NumBits == 256 && isTypeLegal(MVT::v32i8)) | |||
| 6197 | return MVT::v32i8; | |||
| 6198 | ||||
| 6199 | // TODO: Allow 64-bit type for 32-bit target. | |||
| 6200 | // TODO: 512-bit types should be allowed, but make sure that those | |||
| 6201 | // cases are handled in combineVectorSizedSetCCEquality(). | |||
| 6202 | ||||
| 6203 | return MVT::INVALID_SIMPLE_VALUE_TYPE; | |||
| 6204 | } | |||
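// A standalone intrinsics sketch (not LLVM code) of why v16i8 is a fast
// 128-bit equality type: PCMPEQB plus PMOVMSKB reduce 16 bytes to a single
// scalar compare. Assumes it is compiled separately for an SSE2 host.
#include <emmintrin.h>
static bool equal16Bytes(const void *A, const void *B) {
  __m128i VA = _mm_loadu_si128(static_cast<const __m128i *>(A));
  __m128i VB = _mm_loadu_si128(static_cast<const __m128i *>(B));
  // Equal byte lanes compare to 0xFF, so the sign-bit mask is 0xFFFF.
  return _mm_movemask_epi8(_mm_cmpeq_epi8(VA, VB)) == 0xFFFF;
}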
| 6205 | ||||
| 6206 | /// Val is the undef sentinel value or equal to the specified value. | |||
| 6207 | static bool isUndefOrEqual(int Val, int CmpVal) { | |||
| 6208 | return ((Val == SM_SentinelUndef) || (Val == CmpVal)); | |||
| 6209 | } | |||
| 6210 | ||||
| 6211 | /// Return true if every element in Mask is the undef sentinel value or equal | |||
| 6212 | /// to the specified value. | |||
| 6213 | static bool isUndefOrEqual(ArrayRef<int> Mask, int CmpVal) { | |||
| 6214 | return llvm::all_of(Mask, [CmpVal](int M) { | |||
| 6215 | return (M == SM_SentinelUndef) || (M == CmpVal); | |||
| 6216 | }); | |||
| 6217 | } | |||
| 6218 | ||||
| 6219 | /// Val is either the undef or zero sentinel value. | |||
| 6220 | static bool isUndefOrZero(int Val) { | |||
| 6221 | return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero)); | |||
| 6222 | } | |||
| 6223 | ||||
| 6224 | /// Return true if every element in Mask, beginning at position Pos and | |||
| 6225 | /// ending at Pos+Size, is the undef sentinel value. | |||
| 6226 | static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) { | |||
| 6227 | return llvm::all_of(Mask.slice(Pos, Size), | |||
| 6228 | [](int M) { return M == SM_SentinelUndef; }); | |||
| 6229 | } | |||
| 6230 | ||||
| 6231 | /// Return true if the mask creates a vector whose lower half is undefined. | |||
| 6232 | static bool isUndefLowerHalf(ArrayRef<int> Mask) { | |||
| 6233 | unsigned NumElts = Mask.size(); | |||
| 6234 | return isUndefInRange(Mask, 0, NumElts / 2); | |||
| 6235 | } | |||
| 6236 | ||||
| 6237 | /// Return true if the mask creates a vector whose upper half is undefined. | |||
| 6238 | static bool isUndefUpperHalf(ArrayRef<int> Mask) { | |||
| 6239 | unsigned NumElts = Mask.size(); | |||
| 6240 | return isUndefInRange(Mask, NumElts / 2, NumElts / 2); | |||
| 6241 | } | |||
| 6242 | ||||
| 6243 | /// Return true if Val falls within the specified range [Low, Hi). | |||
| 6244 | static bool isInRange(int Val, int Low, int Hi) { | |||
| 6245 | return (Val >= Low && Val < Hi); | |||
| 6246 | } | |||
| 6247 | ||||
| 6248 | /// Return true if the value of any element in Mask falls within the specified | |||
| 6249 | /// range [Low, Hi). | |||
| 6250 | static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) { | |||
| 6251 | return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); }); | |||
| 6252 | } | |||
| 6253 | ||||
| 6254 | /// Return true if the value of any element in Mask is the zero sentinel value. | |||
| 6255 | static bool isAnyZero(ArrayRef<int> Mask) { | |||
| 6256 | return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); | |||
| 6257 | } | |||
| 6258 | ||||
| 6259 | /// Return true if the value of any element in Mask is the zero or undef | |||
| 6260 | /// sentinel values. | |||
| 6261 | static bool isAnyZeroOrUndef(ArrayRef<int> Mask) { | |||
| 6262 | return llvm::any_of(Mask, [](int M) { | |||
| 6263 | return M == SM_SentinelZero || M == SM_SentinelUndef; | |||
| 6264 | }); | |||
| 6265 | } | |||
| 6266 | ||||
| 6267 | /// Return true if Val is undef or if its value falls within the | |||
| 6268 | /// specified range [Low, Hi). | |||
| 6269 | static bool isUndefOrInRange(int Val, int Low, int Hi) { | |||
| 6270 | return (Val == SM_SentinelUndef) || isInRange(Val, Low, Hi); | |||
| 6271 | } | |||
| 6272 | ||||
| 6273 | /// Return true if every element in Mask is undef or if its value | |||
| 6274 | /// falls within the specified range [Low, Hi). | |||
| 6275 | static bool isUndefOrInRange(ArrayRef<int> Mask, int Low, int Hi) { | |||
| 6276 | return llvm::all_of( | |||
| 6277 | Mask, [Low, Hi](int M) { return isUndefOrInRange(M, Low, Hi); }); | |||
| 6278 | } | |||
| 6279 | ||||
| 6280 | /// Return true if Val is undef, zero or if its value falls within the | |||
| 6281 | /// specified range [Low, Hi). | |||
| 6282 | static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) { | |||
| 6283 | return isUndefOrZero(Val) || isInRange(Val, Low, Hi); | |||
| 6284 | } | |||
| 6285 | ||||
| 6286 | /// Return true if every element in Mask is undef, zero or if its value | |||
| 6287 | /// falls within the specified range [Low, Hi). | |||
| 6288 | static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) { | |||
| 6289 | return llvm::all_of( | |||
| 6290 | Mask, [Low, Hi](int M) { return isUndefOrZeroOrInRange(M, Low, Hi); }); | |||
| 6291 | } | |||
| 6292 | ||||
| 6293 | /// Return true if every element in Mask, beginning at position Pos and | |||
| 6294 | /// ending at Pos + Size, falls within the specified | |||
| 6295 | /// sequence (Low, Low + Step, ..., Low + (Size - 1) * Step) or is undef. | |||
| 6296 | static bool isSequentialOrUndefInRange(ArrayRef<int> Mask, unsigned Pos, | |||
| 6297 | unsigned Size, int Low, int Step = 1) { | |||
| 6298 | for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step) | |||
| 6299 | if (!isUndefOrEqual(Mask[i], Low)) | |||
| 6300 | return false; | |||
| 6301 | return true; | |||
| 6302 | } | |||
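// Worked example: Mask = <4, -1, 6, 7> with Pos = 0, Size = 4, Low = 4
// matches, since element 1 is undef and the rest follow the sequence
// 4, 5, 6, 7; a mask starting <4, 6, ...> would fail at the second element.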
| 6303 | ||||
| 6304 | /// Return true if every element in Mask, beginning at position Pos and | |||
| 6305 | /// ending at Pos+Size, falls within the specified sequential range | |||
| 6306 | /// [Low, Low+Size), or is undef or zero. | |||
| 6307 | static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, | |||
| 6308 | unsigned Size, int Low, | |||
| 6309 | int Step = 1) { | |||
| 6310 | for (unsigned i = Pos, e = Pos + Size; i != e; ++i, Low += Step) | |||
| 6311 | if (!isUndefOrZero(Mask[i]) && Mask[i] != Low) | |||
| 6312 | return false; | |||
| 6313 | return true; | |||
| 6314 | } | |||
| 6315 | ||||
| 6316 | /// Return true if every element in Mask, beginning at position Pos and | |||
| 6317 | /// ending at Pos+Size, is undef or zero. | |||
| 6318 | static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos, | |||
| 6319 | unsigned Size) { | |||
| 6320 | return llvm::all_of(Mask.slice(Pos, Size), isUndefOrZero); | |||
| 6321 | } | |||
| 6322 | ||||
| 6323 | /// Helper function to test whether a shuffle mask could be | |||
| 6324 | /// simplified by widening the elements being shuffled. | |||
| 6325 | /// | |||
| 6326 | /// Appends the mask for wider elements in WidenedMask if valid. Otherwise | |||
| 6327 | /// leaves it in an unspecified state. | |||
| 6328 | /// | |||
| 6329 | /// NOTE: This must handle normal vector shuffle masks and *target* vector | |||
| 6330 | /// shuffle masks. The latter have the special property of a '-2' representing | |||
| 6331 | /// a zeroed lane of a vector. | |||
| 6332 | static bool canWidenShuffleElements(ArrayRef<int> Mask, | |||
| 6333 | SmallVectorImpl<int> &WidenedMask) { | |||
| 6334 | WidenedMask.assign(Mask.size() / 2, 0); | |||
| 6335 | for (int i = 0, Size = Mask.size(); i < Size; i += 2) { | |||
| 6336 | int M0 = Mask[i]; | |||
| 6337 | int M1 = Mask[i + 1]; | |||
| 6338 | ||||
| 6339 | // If both elements are undef, it's trivial. | |||
| 6340 | if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) { | |||
| 6341 | WidenedMask[i / 2] = SM_SentinelUndef; | |||
| 6342 | continue; | |||
| 6343 | } | |||
| 6344 | ||||
| 6345 | // Check for an undef mask and a mask value properly aligned to fit with | |||
| 6346 | // a pair of values. If we find such a case, use the non-undef mask's value. | |||
| 6347 | if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) { | |||
| 6348 | WidenedMask[i / 2] = M1 / 2; | |||
| 6349 | continue; | |||
| 6350 | } | |||
| 6351 | if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) { | |||
| 6352 | WidenedMask[i / 2] = M0 / 2; | |||
| 6353 | continue; | |||
| 6354 | } | |||
| 6355 | ||||
| 6356 | // When zeroing, we need to spread the zeroing across both lanes to widen. | |||
| 6357 | if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) { | |||
| 6358 | if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) && | |||
| 6359 | (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) { | |||
| 6360 | WidenedMask[i / 2] = SM_SentinelZero; | |||
| 6361 | continue; | |||
| 6362 | } | |||
| 6363 | return false; | |||
| 6364 | } | |||
| 6365 | ||||
| 6366 | // Finally check if the two mask values are adjacent and aligned with | |||
| 6367 | // a pair. | |||
| 6368 | if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) { | |||
| 6369 | WidenedMask[i / 2] = M0 / 2; | |||
| 6370 | continue; | |||
| 6371 | } | |||
| 6372 | ||||
| 6373 | // Otherwise we can't safely widen the elements used in this shuffle. | |||
| 6374 | return false; | |||
| 6375 | } | |||
| 6376 | assert(WidenedMask.size() == Mask.size() / 2 && | |||
| 6377 |        "Incorrect size of mask after widening the elements!"); | |||
| 6378 | ||||
| 6379 | return true; | |||
| 6380 | } | |||
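// Worked examples: <0,1,6,7> widens to <0,3> (both pairs are adjacent and
// aligned); <-1,3,4,5> widens to <1,2> (the undef pairs with the odd mask
// value 3); <0,2,4,6> fails because 0 and 2 are not an adjacent pair.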
| 6381 | ||||
| 6382 | static bool canWidenShuffleElements(ArrayRef<int> Mask, | |||
| 6383 | const APInt &Zeroable, | |||
| 6384 | bool V2IsZero, | |||
| 6385 | SmallVectorImpl<int> &WidenedMask) { | |||
| 6386 | // Create an alternative mask with info about zeroable elements. | |||
| 6387 | // Here we do not set undef elements as zeroable. | |||
| 6388 | SmallVector<int, 64> ZeroableMask(Mask); | |||
| 6389 | if (V2IsZero) { | |||
| 6390 |   assert(!Zeroable.isZero() && "V2's non-undef elements are used?!"); | |||
| 6391 | for (int i = 0, Size = Mask.size(); i != Size; ++i) | |||
| 6392 | if (Mask[i] != SM_SentinelUndef && Zeroable[i]) | |||
| 6393 | ZeroableMask[i] = SM_SentinelZero; | |||
| 6394 | } | |||
| 6395 | return canWidenShuffleElements(ZeroableMask, WidenedMask); | |||
| 6396 | } | |||
| 6397 | ||||
| 6398 | static bool canWidenShuffleElements(ArrayRef<int> Mask) { | |||
| 6399 | SmallVector<int, 32> WidenedMask; | |||
| 6400 | return canWidenShuffleElements(Mask, WidenedMask); | |||
| 6401 | } | |||
| 6402 | ||||
| 6403 | // Attempt to narrow/widen a shuffle mask until it matches the target number | |||
| 6404 | // of elements. | |||
| 6405 | static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts, | |||
| 6406 | SmallVectorImpl<int> &ScaledMask) { | |||
| 6407 | unsigned NumSrcElts = Mask.size(); | |||
| 6408 | assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) && | |||
| 6409 |        "Illegal shuffle scale factor"); | |||
| 6410 | ||||
| 6411 | // Narrowing is guaranteed to work. | |||
| 6412 | if (NumDstElts >= NumSrcElts) { | |||
| 6413 | int Scale = NumDstElts / NumSrcElts; | |||
| 6414 | llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask); | |||
| 6415 | return true; | |||
| 6416 | } | |||
| 6417 | ||||
| 6418 | // We have to repeat the widening until we reach the target size, but we can | |||
| 6419 | // split out the first widening as it sets up ScaledMask for us. | |||
| 6420 | if (canWidenShuffleElements(Mask, ScaledMask)) { | |||
| 6421 | while (ScaledMask.size() > NumDstElts) { | |||
| 6422 | SmallVector<int, 16> WidenedMask; | |||
| 6423 | if (!canWidenShuffleElements(ScaledMask, WidenedMask)) | |||
| 6424 | return false; | |||
| 6425 | ScaledMask = std::move(WidenedMask); | |||
| 6426 | } | |||
| 6427 | return true; | |||
| 6428 | } | |||
| 6429 | ||||
| 6430 | return false; | |||
| 6431 | } | |||
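// Worked example: Mask = <0,1,2,3> scales to NumDstElts = 8 as
// <0,1,2,3,4,5,6,7> (narrowed elements) and to NumDstElts = 2 as <0,1>
// (widened elements); Mask = <1,0> cannot widen, so scaling it to 1 fails.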
| 6432 | ||||
| 6433 | /// Returns true if Elt is a constant zero or a floating point constant +0.0. | |||
| 6434 | bool X86::isZeroNode(SDValue Elt) { | |||
| 6435 | return isNullConstant(Elt) || isNullFPConstant(Elt); | |||
| 6436 | } | |||
| 6437 | ||||
| 6438 | // Build a vector of constants. | |||
| 6439 | // Use an UNDEF node if MaskElt == -1. | |||
| 6440 | // Split 64-bit constants in 32-bit mode. | |||
| 6441 | static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG, | |||
| 6442 | const SDLoc &dl, bool IsMask = false) { | |||
| 6443 | ||||
| 6444 | SmallVector<SDValue, 32> Ops; | |||
| 6445 | bool Split = false; | |||
| 6446 | ||||
| 6447 | MVT ConstVecVT = VT; | |||
| 6448 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 6449 | bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); | |||
| 6450 | if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { | |||
| 6451 | ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); | |||
| 6452 | Split = true; | |||
| 6453 | } | |||
| 6454 | ||||
| 6455 | MVT EltVT = ConstVecVT.getVectorElementType(); | |||
| 6456 | for (unsigned i = 0; i < NumElts; ++i) { | |||
| 6457 | bool IsUndef = Values[i] < 0 && IsMask; | |||
| 6458 | SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) : | |||
| 6459 | DAG.getConstant(Values[i], dl, EltVT); | |||
| 6460 | Ops.push_back(OpNode); | |||
| 6461 | if (Split) | |||
| 6462 | Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) : | |||
| 6463 | DAG.getConstant(0, dl, EltVT)); | |||
| 6464 | } | |||
| 6465 | SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); | |||
| 6466 | if (Split) | |||
| 6467 | ConstsNode = DAG.getBitcast(VT, ConstsNode); | |||
| 6468 | return ConstsNode; | |||
| 6469 | } | |||
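// Worked example: on a 32-bit target (no legal i64), getConstVector({1, 2},
// MVT::v2i64, ...) builds the v4i32 constant <1, 0, 2, 0> (little-endian
// lane order) and bitcasts it back to v2i64.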
| 6470 | ||||
| 6471 | static SDValue getConstVector(ArrayRef<APInt> Bits, const APInt &Undefs, | |||
| 6472 | MVT VT, SelectionDAG &DAG, const SDLoc &dl) { | |||
| 6473 | assert(Bits.size() == Undefs.getBitWidth() && | |||
| 6474 |        "Unequal constant and undef arrays"); | |||
| 6475 | SmallVector<SDValue, 32> Ops; | |||
| 6476 | bool Split = false; | |||
| 6477 | ||||
| 6478 | MVT ConstVecVT = VT; | |||
| 6479 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 6480 | bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64); | |||
| 6481 | if (!In64BitMode && VT.getVectorElementType() == MVT::i64) { | |||
| 6482 | ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2); | |||
| 6483 | Split = true; | |||
| 6484 | } | |||
| 6485 | ||||
| 6486 | MVT EltVT = ConstVecVT.getVectorElementType(); | |||
| 6487 | for (unsigned i = 0, e = Bits.size(); i != e; ++i) { | |||
| 6488 | if (Undefs[i]) { | |||
| 6489 | Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT)); | |||
| 6490 | continue; | |||
| 6491 | } | |||
| 6492 | const APInt &V = Bits[i]; | |||
| 6493 |   assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes"); | |||
| 6494 | if (Split) { | |||
| 6495 | Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT)); | |||
| 6496 | Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT)); | |||
| 6497 | } else if (EltVT == MVT::f32) { | |||
| 6498 | APFloat FV(APFloat::IEEEsingle(), V); | |||
| 6499 | Ops.push_back(DAG.getConstantFP(FV, dl, EltVT)); | |||
| 6500 | } else if (EltVT == MVT::f64) { | |||
| 6501 | APFloat FV(APFloat::IEEEdouble(), V); | |||
| 6502 | Ops.push_back(DAG.getConstantFP(FV, dl, EltVT)); | |||
| 6503 | } else { | |||
| 6504 | Ops.push_back(DAG.getConstant(V, dl, EltVT)); | |||
| 6505 | } | |||
| 6506 | } | |||
| 6507 | ||||
| 6508 | SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops); | |||
| 6509 | return DAG.getBitcast(VT, ConstsNode); | |||
| 6510 | } | |||
| 6511 | ||||
| 6512 | static SDValue getConstVector(ArrayRef<APInt> Bits, MVT VT, | |||
| 6513 | SelectionDAG &DAG, const SDLoc &dl) { | |||
| 6514 | APInt Undefs = APInt::getZero(Bits.size()); | |||
| 6515 | return getConstVector(Bits, Undefs, VT, DAG, dl); | |||
| 6516 | } | |||
| 6517 | ||||
| 6518 | /// Returns a vector of specified type with all zero elements. | |||
| 6519 | static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget, | |||
| 6520 | SelectionDAG &DAG, const SDLoc &dl) { | |||
| 6521 | assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() || | |||
| 6522 |         VT.getVectorElementType() == MVT::i1) && | |||
| 6523 |        "Unexpected vector type"); | |||
| 6524 | ||||
| 6525 | // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest | |||
| 6526 | // type. This ensures they get CSE'd. But if the integer type is not | |||
| 6527 | // available, use a floating-point +0.0 instead. | |||
| 6528 | SDValue Vec; | |||
| 6529 | if (!Subtarget.hasSSE2() && VT.is128BitVector()) { | |||
| 6530 | Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32); | |||
| 6531 | } else if (VT.isFloatingPoint()) { | |||
| 6532 | Vec = DAG.getConstantFP(+0.0, dl, VT); | |||
| 6533 | } else if (VT.getVectorElementType() == MVT::i1) { | |||
| 6534 |   assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) && | |||
| 6535 |          "Unexpected vector type"); | |||
| 6536 | Vec = DAG.getConstant(0, dl, VT); | |||
| 6537 | } else { | |||
| 6538 | unsigned Num32BitElts = VT.getSizeInBits() / 32; | |||
| 6539 | Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts)); | |||
| 6540 | } | |||
| 6541 | return DAG.getBitcast(VT, Vec); | |||
| 6542 | } | |||
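// Worked example: getZeroVector(MVT::v4f32, ...) without SSE2 emits the
// f32 +0.0 splat directly, while MVT::v4i64 becomes a v8i32 zero bitcast
// to v4i64 so that all same-width integer zero vectors CSE to one node.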
| 6543 | ||||
| 6544 | // Helper to determine whether both ops are subvectors extracted from the | |||
| 6545 | // same source. If commuting is allowed they don't have to be in order (Lo/Hi). | |||
| 6546 | static SDValue getSplitVectorSrc(SDValue LHS, SDValue RHS, bool AllowCommute) { | |||
| 6547 | if (LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR || | |||
| 6548 | RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR || | |||
| 6549 | LHS.getValueType() != RHS.getValueType() || | |||
| 6550 | LHS.getOperand(0) != RHS.getOperand(0)) | |||
| 6551 | return SDValue(); | |||
| 6552 | ||||
| 6553 | SDValue Src = LHS.getOperand(0); | |||
| 6554 | if (Src.getValueSizeInBits() != (LHS.getValueSizeInBits() * 2)) | |||
| 6555 | return SDValue(); | |||
| 6556 | ||||
| 6557 | unsigned NumElts = LHS.getValueType().getVectorNumElements(); | |||
| 6558 | if ((LHS.getConstantOperandAPInt(1) == 0 && | |||
| 6559 | RHS.getConstantOperandAPInt(1) == NumElts) || | |||
| 6560 | (AllowCommute && RHS.getConstantOperandAPInt(1) == 0 && | |||
| 6561 | LHS.getConstantOperandAPInt(1) == NumElts)) | |||
| 6562 | return Src; | |||
| 6563 | ||||
| 6564 | return SDValue(); | |||
| 6565 | } | |||
| 6566 | ||||
| 6567 | static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, | |||
| 6568 | const SDLoc &dl, unsigned vectorWidth) { | |||
| 6569 | EVT VT = Vec.getValueType(); | |||
| 6570 | EVT ElVT = VT.getVectorElementType(); | |||
| 6571 | unsigned Factor = VT.getSizeInBits() / vectorWidth; | |||
| 6572 | EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, | |||
| 6573 | VT.getVectorNumElements() / Factor); | |||
| 6574 | ||||
| 6575 | // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR | |||
| 6576 | unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); | |||
| 6577 | assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); | |||
| 6578 | ||||
| 6579 | // This is the index of the first element of the vectorWidth-bit chunk | |||
| 6580 | // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits. | |||
| 6581 | IdxVal &= ~(ElemsPerChunk - 1); | |||
| 6582 | ||||
| 6583 | // If the input is a buildvector just emit a smaller one. | |||
| 6584 | if (Vec.getOpcode() == ISD::BUILD_VECTOR) | |||
| 6585 | return DAG.getBuildVector(ResultVT, dl, | |||
| 6586 | Vec->ops().slice(IdxVal, ElemsPerChunk)); | |||
| 6587 | ||||
| 6588 | SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); | |||
| 6589 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); | |||
| 6590 | } | |||
| 6591 | ||||
| 6592 | /// Generate a DAG to grab 128-bits from a vector > 128 bits. This | |||
| 6593 | /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 | |||
| 6594 | /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 | |||
| 6595 | /// instructions or a simple subregister reference. Idx is an index in the | |||
| 6596 | /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes | |||
| 6597 | /// lowering EXTRACT_VECTOR_ELT operations easier. | |||
| 6598 | static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal, | |||
| 6599 | SelectionDAG &DAG, const SDLoc &dl) { | |||
| 6600 | assert((Vec.getValueType().is256BitVector() || | |||
| 6601 |         Vec.getValueType().is512BitVector()) && "Unexpected vector size!"); | |||
| 6602 | return extractSubVector(Vec, IdxVal, DAG, dl, 128); | |||
| 6603 | } | |||
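// Worked example: extract128BitVector(v8i32 Vec, /*IdxVal=*/5, ...) rounds
// the index down to 4 (a 128-bit chunk holds 4 x i32) and extracts elements
// 4..7, i.e. the upper xmm half of the ymm value.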
| 6604 | ||||
| 6605 | /// Generate a DAG to grab 256-bits from a 512-bit vector. | |||
| 6606 | static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal, | |||
| 6607 | SelectionDAG &DAG, const SDLoc &dl) { | |||
| 6608 | assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!"); | |||
| 6609 | return extractSubVector(Vec, IdxVal, DAG, dl, 256); | |||
| 6610 | } | |||
| 6611 | ||||
| 6612 | static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal, | |||
| 6613 | SelectionDAG &DAG, const SDLoc &dl, | |||
| 6614 | unsigned vectorWidth) { | |||
| 6615 | assert((vectorWidth == 128 || vectorWidth == 256) && | |||
| 6616 |        "Unsupported vector width"); | |||
| 6617 | // Inserting UNDEF just returns Result. | |||
| 6618 | if (Vec.isUndef()) | |||
| 6619 | return Result; | |||
| 6620 | EVT VT = Vec.getValueType(); | |||
| 6621 | EVT ElVT = VT.getVectorElementType(); | |||
| 6622 | EVT ResultVT = Result.getValueType(); | |||
| 6623 | ||||
| 6624 | // Insert the relevant vectorWidth bits. | |||
| 6625 | unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits(); | |||
| 6626 | assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); | |||
| 6627 | ||||
| 6628 | // This is the index of the first element of the vectorWidth-bit chunk | |||
| 6629 | // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits. | |||
| 6630 | IdxVal &= ~(ElemsPerChunk - 1); | |||
| 6631 | ||||
| 6632 | SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); | |||
| 6633 | return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx); | |||
| 6634 | } | |||
| 6635 | ||||
| 6636 | /// Generate a DAG to put 128-bits into a vector > 128 bits. This | |||
| 6637 | /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or | |||
| 6638 | /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a | |||
| 6639 | /// simple superregister reference. Idx is an index in the 128 bits | |||
| 6640 | /// we want. It need not be aligned to a 128-bit boundary. That makes | |||
| 6641 | /// lowering INSERT_VECTOR_ELT operations easier. | |||
| 6642 | static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, | |||
| 6643 | SelectionDAG &DAG, const SDLoc &dl) { | |||
| 6644 | assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); | |||
| 6645 | return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128); | |||
| 6646 | } | |||
| 6647 | ||||
| 6648 | /// Widen a vector to a larger size with the same scalar type, with the new | |||
| 6649 | /// elements either zero or undef. | |||
| 6650 | static SDValue widenSubVector(MVT VT, SDValue Vec, bool ZeroNewElements, | |||
| 6651 | const X86Subtarget &Subtarget, SelectionDAG &DAG, | |||
| 6652 | const SDLoc &dl) { | |||
| 6653 | assert(Vec.getValueSizeInBits().getFixedValue() < VT.getFixedSizeInBits() && | |||
| 6654 |        Vec.getValueType().getScalarType() == VT.getScalarType() && | |||
| 6655 |        "Unsupported vector widening type"); | |||
| 6656 | SDValue Res = ZeroNewElements ? getZeroVector(VT, Subtarget, DAG, dl) | |||
| 6657 | : DAG.getUNDEF(VT); | |||
| 6658 | return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, Vec, | |||
| 6659 | DAG.getIntPtrConstant(0, dl)); | |||
| 6660 | } | |||
| 6661 | ||||
| 6662 | /// Widen a vector to a larger size with the same scalar type, with the new | |||
| 6663 | /// elements either zero or undef. | |||
| 6664 | static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements, | |||
| 6665 | const X86Subtarget &Subtarget, SelectionDAG &DAG, | |||
| 6666 | const SDLoc &dl, unsigned WideSizeInBits) { | |||
| 6667 | assert(Vec.getValueSizeInBits() < WideSizeInBits && | |||
| 6668 |        (WideSizeInBits % Vec.getScalarValueSizeInBits()) == 0 && | |||
| 6669 |        "Unsupported vector widening type"); | |||
| 6670 | unsigned WideNumElts = WideSizeInBits / Vec.getScalarValueSizeInBits(); | |||
| 6671 | MVT SVT = Vec.getSimpleValueType().getScalarType(); | |||
| 6672 | MVT VT = MVT::getVectorVT(SVT, WideNumElts); | |||
| 6673 | return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl); | |||
| 6674 | } | |||
| 6675 | ||||
| 6676 | // Helper function to collect subvector ops that are concatenated together, | |||
| 6677 | // either by ISD::CONCAT_VECTORS or an ISD::INSERT_SUBVECTOR series. | |||
| 6678 | // The subvectors in Ops are guaranteed to be the same type. | |||
| 6679 | static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops, | |||
| 6680 | SelectionDAG &DAG) { | |||
| 6681 | assert(Ops.empty() && "Expected an empty ops vector"); | |||
| 6682 | ||||
| 6683 | if (N->getOpcode() == ISD::CONCAT_VECTORS) { | |||
| 6684 | Ops.append(N->op_begin(), N->op_end()); | |||
| 6685 | return true; | |||
| 6686 | } | |||
| 6687 | ||||
| 6688 | if (N->getOpcode() == ISD::INSERT_SUBVECTOR) { | |||
| 6689 | SDValue Src = N->getOperand(0); | |||
| 6690 | SDValue Sub = N->getOperand(1); | |||
| 6691 | const APInt &Idx = N->getConstantOperandAPInt(2); | |||
| 6692 | EVT VT = Src.getValueType(); | |||
| 6693 | EVT SubVT = Sub.getValueType(); | |||
| 6694 | ||||
| 6695 | // TODO - Handle more general insert_subvector chains. | |||
| 6696 | if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2)) { | |||
| 6697 | // insert_subvector(undef, x, lo) | |||
| 6698 | if (Idx == 0 && Src.isUndef()) { | |||
| 6699 | Ops.push_back(Sub); | |||
| 6700 | Ops.push_back(DAG.getUNDEF(SubVT)); | |||
| 6701 | return true; | |||
| 6702 | } | |||
| 6703 | if (Idx == (VT.getVectorNumElements() / 2)) { | |||
| 6704 | // insert_subvector(insert_subvector(undef, x, lo), y, hi) | |||
| 6705 | if (Src.getOpcode() == ISD::INSERT_SUBVECTOR && | |||
| 6706 | Src.getOperand(1).getValueType() == SubVT && | |||
| 6707 | isNullConstant(Src.getOperand(2))) { | |||
| 6708 | Ops.push_back(Src.getOperand(1)); | |||
| 6709 | Ops.push_back(Sub); | |||
| 6710 | return true; | |||
| 6711 | } | |||
| 6712 | // insert_subvector(x, extract_subvector(x, lo), hi) | |||
| 6713 | if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR && | |||
| 6714 | Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) { | |||
| 6715 | Ops.append(2, Sub); | |||
| 6716 | return true; | |||
| 6717 | } | |||
| 6718 | // insert_subvector(undef, x, hi) | |||
| 6719 | if (Src.isUndef()) { | |||
| 6720 | Ops.push_back(DAG.getUNDEF(SubVT)); | |||
| 6721 | Ops.push_back(Sub); | |||
| 6722 | return true; | |||
| 6723 | } | |||
| 6724 | } | |||
| 6725 | } | |||
| 6726 | } | |||
| 6727 | ||||
| 6728 | return false; | |||
| 6729 | } | |||
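// Usage sketch: callers recover the concatenated halves, e.g.
//   SmallVector<SDValue, 2> SubOps;
//   if (collectConcatOps(N, SubOps, DAG)) { /* act on the two halves */ }
// so insert_subvector(insert_subvector(undef, X, 0), Y, NumElts / 2)
// yields SubOps = {X, Y}.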
| 6730 | ||||
| 6731 | static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG, | |||
| 6732 | const SDLoc &dl) { | |||
| 6733 | EVT VT = Op.getValueType(); | |||
| 6734 | unsigned NumElems = VT.getVectorNumElements(); | |||
| 6735 | unsigned SizeInBits = VT.getSizeInBits(); | |||
| 6736 | assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 && | |||
| 6737 |        "Can't split odd sized vector"); | |||
| 6738 | ||||
| 6739 | // If this is a splat value (with no undefs) then use the lower subvector, | |||
| 6740 | // which should be a free extraction. | |||
| 6741 | SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2); | |||
| 6742 | if (DAG.isSplatValue(Op, /*AllowUndefs*/ false)) | |||
| 6743 | return std::make_pair(Lo, Lo); | |||
| 6744 | ||||
| 6745 | SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2); | |||
| 6746 | return std::make_pair(Lo, Hi); | |||
| 6747 | } | |||
| 6748 | ||||
| 6749 | /// Break an operation into 2 half-sized ops and then concatenate the results. | |||
| 6750 | static SDValue splitVectorOp(SDValue Op, SelectionDAG &DAG) { | |||
| 6751 | unsigned NumOps = Op.getNumOperands(); | |||
| 6752 | EVT VT = Op.getValueType(); | |||
| 6753 | SDLoc dl(Op); | |||
| 6754 | ||||
| 6755 | // Extract the Lo/Hi vectors of each operand. | |||
| 6756 | SmallVector<SDValue> LoOps(NumOps, SDValue()); | |||
| 6757 | SmallVector<SDValue> HiOps(NumOps, SDValue()); | |||
| 6758 | for (unsigned I = 0; I != NumOps; ++I) { | |||
| 6759 | SDValue SrcOp = Op.getOperand(I); | |||
| 6760 | if (!SrcOp.getValueType().isVector()) { | |||
| 6761 | LoOps[I] = HiOps[I] = SrcOp; | |||
| 6762 | continue; | |||
| 6763 | } | |||
| 6764 | std::tie(LoOps[I], HiOps[I]) = splitVector(SrcOp, DAG, dl); | |||
| 6765 | } | |||
| 6766 | ||||
| 6767 | EVT LoVT, HiVT; | |||
| 6768 | std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); | |||
| 6769 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, | |||
| 6770 | DAG.getNode(Op.getOpcode(), dl, LoVT, LoOps), | |||
| 6771 | DAG.getNode(Op.getOpcode(), dl, HiVT, HiOps)); | |||
| 6772 | } | |||
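// Worked example: splitting a v8i32 ADD extracts v4i32 Lo/Hi halves of each
// operand, emits two v4i32 ADDs, and rejoins them with
// concat_vectors(LoAdd, HiAdd); non-vector operands are passed unchanged to
// both halves.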
| 6773 | ||||
| 6774 | /// Break a unary integer operation into 2 half-sized ops and then | |||
| 6775 | /// concatenate the results back. | |||
| 6776 | static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) { | |||
| 6777 | // Make sure we only try to split 256/512-bit types to avoid creating | |||
| 6778 | // narrow vectors. | |||
| 6779 | EVT VT = Op.getValueType(); | |||
| 6780 | (void)VT; | |||
| 6781 | assert((Op.getOperand(0).getValueType().is256BitVector() || | |||
| 6782 |         Op.getOperand(0).getValueType().is512BitVector()) && | |||
| 6783 |        (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!"); | |||
| 6784 | assert(Op.getOperand(0).getValueType().getVectorNumElements() == | |||
| 6785 |        VT.getVectorNumElements() && | |||
| 6786 |        "Unexpected VTs!"); | |||
| 6787 | return splitVectorOp(Op, DAG); | |||
| 6788 | } | |||
| 6789 | ||||
| 6790 | /// Break a binary integer operation into 2 half-sized ops and then | |||
| 6791 | /// concatenate the results back. | |||
| 6792 | static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) { | |||
| 6793 | // Assert that all the types match. | |||
| 6794 | EVT VT = Op.getValueType(); | |||
| 6795 | (void)VT; | |||
| 6796 | assert(Op.getOperand(0).getValueType() == VT && | |||
| 6797 |        Op.getOperand(1).getValueType() == VT && "Unexpected VTs!"); | |||
| 6798 | assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!"); | |||
| 6799 | return splitVectorOp(Op, DAG); | |||
| 6800 | } | |||
| 6801 | ||||
| 6802 | // Helper for splitting the operands of an operation to a legal target size | |||
| 6803 | // and applying a function on each part. | |||
| 6804 | // Useful for operations that are available on SSE2 in 128-bit, on AVX2 in | |||
| 6805 | // 256-bit and on AVX512BW in 512-bit. The argument VT is the type used for | |||
| 6806 | // deciding if/how to split Ops. Ops elements do *not* have to be of type VT. | |||
| 6807 | // The argument Builder is a function that will be applied on each split part: | |||
| 6808 | // SDValue Builder(SelectionDAG &G, SDLoc DL, ArrayRef<SDValue> Ops) | |||
| 6809 | template <typename F> | |||
| 6810 | SDValue SplitOpsAndApply(SelectionDAG &DAG, const X86Subtarget &Subtarget, | |||
| 6811 | const SDLoc &DL, EVT VT, ArrayRef<SDValue> Ops, | |||
| 6812 | F Builder, bool CheckBWI = true) { | |||
| 6813 | assert(Subtarget.hasSSE2() && "Target assumed to support at least SSE2"); | |||
| 6814 | unsigned NumSubs = 1; | |||
| 6815 | if ((CheckBWI && Subtarget.useBWIRegs()) || | |||
| 6816 | (!CheckBWI && Subtarget.useAVX512Regs())) { | |||
| 6817 | if (VT.getSizeInBits() > 512) { | |||
| 6818 | NumSubs = VT.getSizeInBits() / 512; | |||
| 6819 |     assert((VT.getSizeInBits() % 512) == 0 && "Illegal vector size"); | |||
| 6820 | } | |||
| 6821 | } else if (Subtarget.hasAVX2()) { | |||
| 6822 | if (VT.getSizeInBits() > 256) { | |||
| 6823 | NumSubs = VT.getSizeInBits() / 256; | |||
| 6824 |     assert((VT.getSizeInBits() % 256) == 0 && "Illegal vector size"); | |||
| 6825 | } | |||
| 6826 | } else { | |||
| 6827 | if (VT.getSizeInBits() > 128) { | |||
| 6828 | NumSubs = VT.getSizeInBits() / 128; | |||
| 6829 |     assert((VT.getSizeInBits() % 128) == 0 && "Illegal vector size"); | |||
| 6830 | } | |||
| 6831 | } | |||
| 6832 | ||||
| 6833 | if (NumSubs == 1) | |||
| 6834 | return Builder(DAG, DL, Ops); | |||
| 6835 | ||||
| 6836 | SmallVector<SDValue, 4> Subs; | |||
| 6837 | for (unsigned i = 0; i != NumSubs; ++i) { | |||
| 6838 | SmallVector<SDValue, 2> SubOps; | |||
| 6839 | for (SDValue Op : Ops) { | |||
| 6840 | EVT OpVT = Op.getValueType(); | |||
| 6841 | unsigned NumSubElts = OpVT.getVectorNumElements() / NumSubs; | |||
| 6842 | unsigned SizeSub = OpVT.getSizeInBits() / NumSubs; | |||
| 6843 | SubOps.push_back(extractSubVector(Op, i * NumSubElts, DAG, DL, SizeSub)); | |||
| 6844 | } | |||
| 6845 | Subs.push_back(Builder(DAG, DL, SubOps)); | |||
| 6846 | } | |||
| 6847 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs); | |||
| 6848 | } | |||
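// A hedged usage sketch (names illustrative only): the Builder lambda
// receives the already-split operands and returns one sub-result per chunk:
//   auto ADDBuilder = [](SelectionDAG &G, const SDLoc &DL,
//                        ArrayRef<SDValue> SubOps) {
//     return G.getNode(ISD::ADD, DL, SubOps[0].getValueType(), SubOps);
//   };
//   SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, VT, {LHS, RHS},
//                                  ADDBuilder);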
| 6849 | ||||
| 6850 | // Helper function that extends a non-512-bit vector op to 512 bits on | |||
| 6851 | // non-VLX targets. | |||
| 6852 | static SDValue getAVX512Node(unsigned Opcode, const SDLoc &DL, MVT VT, | |||
| 6853 | ArrayRef<SDValue> Ops, SelectionDAG &DAG, | |||
| 6854 | const X86Subtarget &Subtarget) { | |||
| 6855 | assert(Subtarget.hasAVX512() && "AVX512 target expected"); | |||
| 6856 | MVT SVT = VT.getScalarType(); | |||
| 6857 | ||||
| 6858 | // If we have a splatted 32/64-bit constant, splat it to DstTy to | |||
| 6859 | // encourage a foldable broadcasted operand. | |||
| 6860 | auto MakeBroadcastOp = [&](SDValue Op, MVT OpVT, MVT DstVT) { | |||
| 6861 | unsigned OpEltSizeInBits = OpVT.getScalarSizeInBits(); | |||
| 6862 | // AVX512 broadcasts 32/64-bit operands. | |||
| 6863 | // TODO: Support float once getAVX512Node is used by fp-ops. | |||
| 6864 | if (!OpVT.isInteger() || OpEltSizeInBits < 32 || | |||
| 6865 | !DAG.getTargetLoweringInfo().isTypeLegal(SVT)) | |||
| 6866 | return SDValue(); | |||
| 6867 | // If we're not widening, don't bother if we're not bitcasting. | |||
| 6868 | if (OpVT == DstVT && Op.getOpcode() != ISD::BITCAST) | |||
| 6869 | return SDValue(); | |||
| 6870 | if (auto *BV = dyn_cast<BuildVectorSDNode>(peekThroughBitcasts(Op))) { | |||
| 6871 | APInt SplatValue, SplatUndef; | |||
| 6872 | unsigned SplatBitSize; | |||
| 6873 | bool HasAnyUndefs; | |||
| 6874 | if (BV->isConstantSplat(SplatValue, SplatUndef, SplatBitSize, | |||
| 6875 | HasAnyUndefs, OpEltSizeInBits) && | |||
| 6876 | !HasAnyUndefs && SplatValue.getBitWidth() == OpEltSizeInBits) | |||
| 6877 | return DAG.getConstant(SplatValue, DL, DstVT); | |||
| 6878 | } | |||
| 6879 | return SDValue(); | |||
| 6880 | }; | |||
| 6881 | ||||
| 6882 | bool Widen = !(Subtarget.hasVLX() || VT.is512BitVector()); | |||
| 6883 | ||||
| 6884 | MVT DstVT = VT; | |||
| 6885 | if (Widen) | |||
| 6886 | DstVT = MVT::getVectorVT(SVT, 512 / SVT.getSizeInBits()); | |||
| 6887 | ||||
| 6888 | // Canonicalize src operands. | |||
| 6889 | SmallVector<SDValue> SrcOps(Ops.begin(), Ops.end()); | |||
| 6890 | for (SDValue &Op : SrcOps) { | |||
| 6891 | MVT OpVT = Op.getSimpleValueType(); | |||
| 6892 | // Just pass through scalar operands. | |||
| 6893 | if (!OpVT.isVector()) | |||
| 6894 | continue; | |||
| 6895 |   assert(OpVT == VT && "Vector type mismatch"); | |||
| 6896 | ||||
| 6897 | if (SDValue BroadcastOp = MakeBroadcastOp(Op, OpVT, DstVT)) { | |||
| 6898 | Op = BroadcastOp; | |||
| 6899 | continue; | |||
| 6900 | } | |||
| 6901 | ||||
| 6902 | // Just widen the subvector by inserting into an undef wide vector. | |||
| 6903 | if (Widen) | |||
| 6904 | Op = widenSubVector(Op, false, Subtarget, DAG, DL, 512); | |||
| 6905 | } | |||
| 6906 | ||||
| 6907 | SDValue Res = DAG.getNode(Opcode, DL, DstVT, SrcOps); | |||
| 6908 | ||||
| 6909 | // Perform the 512-bit op then extract the bottom subvector. | |||
| 6910 | if (Widen) | |||
| 6911 | Res = extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits()); | |||
| 6912 | return Res; | |||
| 6913 | } | |||
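// Usage sketch: with AVX512F but no VLX, requesting a 256-bit node here
// widens each v8i32 operand into an undef v16i32, emits the 512-bit opcode,
// and extracts the low 256 bits, since the instruction only exists in zmm
// form on such targets; splatted 32/64-bit constants are instead rebuilt at
// the wide type so they stay foldable as broadcasts.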
| 6914 | ||||
| 6915 | /// Insert an i1 subvector into an i1 vector. | |||
| 6916 | static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, | |||
| 6917 | const X86Subtarget &Subtarget) { | |||
| 6918 | ||||
| 6919 | SDLoc dl(Op); | |||
| 6920 | SDValue Vec = Op.getOperand(0); | |||
| 6921 | SDValue SubVec = Op.getOperand(1); | |||
| 6922 | SDValue Idx = Op.getOperand(2); | |||
| 6923 | unsigned IdxVal = Op.getConstantOperandVal(2); | |||
| 6924 | ||||
| 6925 | // Inserting undef is a nop. We can just return the original vector. | |||
| 6926 | if (SubVec.isUndef()) | |||
| 6927 | return Vec; | |||
| 6928 | ||||
| 6929 | if (IdxVal == 0 && Vec.isUndef()) // the operation is legal | |||
| 6930 | return Op; | |||
| 6931 | ||||
| 6932 | MVT OpVT = Op.getSimpleValueType(); | |||
| 6933 | unsigned NumElems = OpVT.getVectorNumElements(); | |||
| 6934 | SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); | |||
| 6935 | ||||
| 6936 | // Extend to a natively supported kshift type. | |||
| 6937 | MVT WideOpVT = OpVT; | |||
| 6938 | if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) | |||
| 6939 | WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; | |||
| 6940 | ||||
| 6941 | // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts | |||
| 6942 | // if necessary. | |||
| 6943 | if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) { | |||
| 6944 | // May need to promote to a legal type. | |||
| 6945 | Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, | |||
| 6946 | DAG.getConstant(0, dl, WideOpVT), | |||
| 6947 | SubVec, Idx); | |||
| 6948 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); | |||
| 6949 | } | |||
| 6950 | ||||
| 6951 | MVT SubVecVT = SubVec.getSimpleValueType(); | |||
| 6952 | unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); | |||
| 6953 | assert(IdxVal + SubVecNumElems <= NumElems && | |||
| 6954 |        IdxVal % SubVecVT.getSizeInBits() == 0 && | |||
| 6955 |        "Unexpected index value in INSERT_SUBVECTOR"); | |||
| 6956 | ||||
| 6957 | SDValue Undef = DAG.getUNDEF(WideOpVT); | |||
| 6958 | ||||
| 6959 | if (IdxVal == 0) { | |||
| 6960 | // Zero the lower bits of Vec. | |||
| 6961 | SDValue ShiftBits = DAG.getTargetConstant(SubVecNumElems, dl, MVT::i8); | |||
| 6962 | Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, | |||
| 6963 | ZeroIdx); | |||
| 6964 | Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); | |||
| 6965 | Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); | |||
| 6966 | // Merge them together; the insert above zero-extends SubVec. | |||
| 6967 | SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, | |||
| 6968 | DAG.getConstant(0, dl, WideOpVT), | |||
| 6969 | SubVec, ZeroIdx); | |||
| 6970 | Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); | |||
| 6971 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); | |||
| 6972 | } | |||
| 6973 | ||||
| 6974 | SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, | |||
| 6975 | Undef, SubVec, ZeroIdx); | |||
| 6976 | ||||
| 6977 | if (Vec.isUndef()) { | |||
| 6978 | assert(IdxVal != 0 && "Unexpected index"); | |||
| 6979 | SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, | |||
| 6980 | DAG.getTargetConstant(IdxVal, dl, MVT::i8)); | |||
| 6981 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); | |||
| 6982 | } | |||
| 6983 | ||||
| 6984 | if (ISD::isBuildVectorAllZeros(Vec.getNode())) { | |||
| 6985 | assert(IdxVal != 0 && "Unexpected index"); | |||
| 6986 | // If upper elements of Vec are known undef, then just shift into place. | |||
| 6987 | if (llvm::all_of(Vec->ops().slice(IdxVal + SubVecNumElems), | |||
| 6988 | [](SDValue V) { return V.isUndef(); })) { | |||
| 6989 | SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, | |||
| 6990 | DAG.getTargetConstant(IdxVal, dl, MVT::i8)); | |||
| 6991 | } else { | |||
| 6992 | NumElems = WideOpVT.getVectorNumElements(); | |||
| 6993 | unsigned ShiftLeft = NumElems - SubVecNumElems; | |||
| 6994 | unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; | |||
| 6995 | SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, | |||
| 6996 | DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); | |||
| 6997 | if (ShiftRight != 0) | |||
| 6998 | SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, | |||
| 6999 | DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); | |||
| 7000 | } | |||
| 7001 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); | |||
| 7002 | } | |||
| 7003 | ||||
| 7004 | // Simple case when we put the subvector in the upper part. | |||
| 7005 | if (IdxVal + SubVecNumElems == NumElems) { | |||
| 7006 | SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, | |||
| 7007 | DAG.getTargetConstant(IdxVal, dl, MVT::i8)); | |||
| 7008 | if (SubVecNumElems * 2 == NumElems) { | |||
| 7009 | // Special case: use a legal zero-extending insert_subvector. This allows | |||
| 7010 | // isel to optimize when bits are known zero. | |||
| 7011 | Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx); | |||
| 7012 | Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, | |||
| 7013 | DAG.getConstant(0, dl, WideOpVT), | |||
| 7014 | Vec, ZeroIdx); | |||
| 7015 | } else { | |||
| 7016 | // Otherwise use explicit shifts to zero the bits. | |||
| 7017 | Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, | |||
| 7018 | Undef, Vec, ZeroIdx); | |||
| 7019 | NumElems = WideOpVT.getVectorNumElements(); | |||
| 7020 | SDValue ShiftBits = DAG.getTargetConstant(NumElems - IdxVal, dl, MVT::i8); | |||
| 7021 | Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); | |||
| 7022 | Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); | |||
| 7023 | } | |||
| 7024 | Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); | |||
| 7025 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); | |||
| 7026 | } | |||
| 7027 | ||||
| 7028 | // Inserting into the middle is more complicated. | |||
| 7029 | ||||
| 7030 | NumElems = WideOpVT.getVectorNumElements(); | |||
| 7031 | ||||
| 7032 | // Widen the vector if needed. | |||
| 7033 | Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); | |||
| 7034 | ||||
| 7035 | unsigned ShiftLeft = NumElems - SubVecNumElems; | |||
| 7036 | unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; | |||
| 7037 | ||||
| 7038 | // Do an optimization for the most frequently used types. | |||
| 7039 | if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) { | |||
| 7040 | APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems); | |||
| 7041 | Mask0.flipAllBits(); | |||
| 7042 | SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems)); | |||
| 7043 | SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0); | |||
| 7044 | Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0); | |||
| 7045 | SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, | |||
| 7046 | DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); | |||
| 7047 | SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, | |||
| 7048 | DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); | |||
| 7049 | Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); | |||
| 7050 | ||||
| 7051 | // Reduce to original width if needed. | |||
| 7052 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); | |||
| 7053 | } | |||
| 7054 | ||||
| 7055 | // Clear the upper bits of the subvector and move it to its insert position. | |||
| 7056 | SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, | |||
| 7057 | DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); | |||
| 7058 | SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, | |||
| 7059 | DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); | |||
| 7060 | ||||
| 7061 | // Isolate the bits below the insertion point. | |||
| 7062 | unsigned LowShift = NumElems - IdxVal; | |||
| 7063 | SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, | |||
| 7064 | DAG.getTargetConstant(LowShift, dl, MVT::i8)); | |||
| 7065 | Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low, | |||
| 7066 | DAG.getTargetConstant(LowShift, dl, MVT::i8)); | |||
| 7067 | ||||
| 7068 | // Isolate the bits after the last inserted bit. | |||
| 7069 | unsigned HighShift = IdxVal + SubVecNumElems; | |||
| 7070 | SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, | |||
| 7071 | DAG.getTargetConstant(HighShift, dl, MVT::i8)); | |||
| 7072 | High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High, | |||
| 7073 | DAG.getTargetConstant(HighShift, dl, MVT::i8)); | |||
| 7074 | ||||
| 7075 | // Now OR all 3 pieces together. | |||
| 7076 | Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High); | |||
| 7077 | SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec); | |||
| 7078 | ||||
| 7079 | // Reduce to original width if needed. | |||
| 7080 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); | |||
| 7081 | } | |||
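| | // Worked example of the middle-insertion path (illustrative, not in the | |||
| | // original source): inserting a v4i1 SubVec at IdxVal == 4 into a v16i1 Vec | |||
| | // gives ShiftLeft == 12 and ShiftRight == 8; KSHIFTL by 12 then KSHIFTR by 8 | |||
| | // parks the four subvector bits at positions [4,8), Vec is ANDed with | |||
| | // ~0x00F0, and the two are ORed. The Low/High split at the end of the | |||
| | // function handles v64i1 on 32-bit targets, where a 64-bit mask constant is | |||
| | // not cheap. | |||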
| 7082 | ||||
| 7083 | static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, | |||
| 7084 | const SDLoc &dl) { | |||
| 7085 | assert(V1.getValueType() == V2.getValueType() && "subvector type mismatch"); | |||
| 7086 | EVT SubVT = V1.getValueType(); | |||
| 7087 | EVT SubSVT = SubVT.getScalarType(); | |||
| 7088 | unsigned SubNumElts = SubVT.getVectorNumElements(); | |||
| 7089 | unsigned SubVectorWidth = SubVT.getSizeInBits(); | |||
| 7090 | EVT VT = EVT::getVectorVT(*DAG.getContext(), SubSVT, 2 * SubNumElts); | |||
| 7091 | SDValue V = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, dl, SubVectorWidth); | |||
| 7092 | return insertSubVector(V, V2, SubNumElts, DAG, dl, SubVectorWidth); | |||
| 7093 | } | |||
| 7094 | ||||
| 7095 | /// Returns a vector of specified type with all bits set. | |||
| 7096 | /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>. | |||
| 7097 | /// Then bitcast to their original type, ensuring they get CSE'd. | |||
| 7098 | static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) { | |||
| 7099 | assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && | |||
| 7100 | "Expected a 128/256/512-bit vector type"); | |||
| 7101 | ||||
| 7102 | APInt Ones = APInt::getAllOnes(32); | |||
| 7103 | unsigned NumElts = VT.getSizeInBits() / 32; | |||
| 7104 | SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts)); | |||
| 7105 | return DAG.getBitcast(VT, Vec); | |||
| 7106 | } | |||
| 7107 | ||||
| 7108 | static SDValue getEXTEND_VECTOR_INREG(unsigned Opcode, const SDLoc &DL, EVT VT, | |||
| 7109 | SDValue In, SelectionDAG &DAG) { | |||
| 7110 | EVT InVT = In.getValueType(); | |||
| 7111 | assert(VT.isVector() && InVT.isVector() && "Expected vector VTs."); | |||
| 7112 | assert((ISD::ANY_EXTEND == Opcode || ISD::SIGN_EXTEND == Opcode || | |||
| 7113 | ISD::ZERO_EXTEND == Opcode) && | |||
| 7114 | "Unknown extension opcode"); | |||
| 7115 | ||||
| 7116 | // For 256-bit vectors, we only need the lower (128-bit) input half. | |||
| 7117 | // For 512-bit vectors, we only need the lower input half or quarter. | |||
| 7118 | if (InVT.getSizeInBits() > 128) { | |||
| 7119 | assert(VT.getSizeInBits() == InVT.getSizeInBits() && | |||
| 7120 | "Expected VTs to be the same size!"); | |||
| 7121 | unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits(); | |||
| 7122 | In = extractSubVector(In, 0, DAG, DL, | |||
| 7123 | std::max(128U, (unsigned)VT.getSizeInBits() / Scale)); | |||
| 7124 | InVT = In.getValueType(); | |||
| 7125 | } | |||
| 7126 | ||||
| 7127 | if (VT.getVectorNumElements() != InVT.getVectorNumElements()) | |||
| 7128 | Opcode = DAG.getOpcode_EXTEND_VECTOR_INREG(Opcode); | |||
| 7129 | ||||
| 7130 | return DAG.getNode(Opcode, DL, VT, In); | |||
| 7131 | } | |||
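| | // Illustrative example (not in the original source): extending v32i8 -> | |||
| | // v16i16 first extracts the lower 128-bit half (Scale == 2, so | |||
| | // max(128, 256 / 2) == 128 bits, i.e. v16i8); the element counts then match | |||
| | // and a plain ISD::*_EXTEND is emitted. For v16i8 -> v8i16 in a single | |||
| | // 128-bit register the counts differ (16 vs 8), so the opcode is switched | |||
| | // to the *_EXTEND_VECTOR_INREG form, which extends only the low elements. | |||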
| 7132 | ||||
| 7133 | // Match (xor X, -1) -> X. | |||
| 7134 | // Match extract_subvector(xor X, -1) -> extract_subvector(X). | |||
| 7135 | // Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y). | |||
| 7136 | static SDValue IsNOT(SDValue V, SelectionDAG &DAG) { | |||
| 7137 | V = peekThroughBitcasts(V); | |||
| 7138 | if (V.getOpcode() == ISD::XOR && | |||
| 7139 | (ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()) || | |||
| 7140 | isAllOnesConstant(V.getOperand(1)))) | |||
| 7141 | return V.getOperand(0); | |||
| 7142 | if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR && | |||
| 7143 | (isNullConstant(V.getOperand(1)) || V.getOperand(0).hasOneUse())) { | |||
| 7144 | if (SDValue Not = IsNOT(V.getOperand(0), DAG)) { | |||
| 7145 | Not = DAG.getBitcast(V.getOperand(0).getValueType(), Not); | |||
| 7146 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Not), V.getValueType(), | |||
| 7147 | Not, V.getOperand(1)); | |||
| 7148 | } | |||
| 7149 | } | |||
| 7150 | SmallVector<SDValue, 2> CatOps; | |||
| 7151 | if (collectConcatOps(V.getNode(), CatOps, DAG)) { | |||
| 7152 | for (SDValue &CatOp : CatOps) { | |||
| 7153 | SDValue NotCat = IsNOT(CatOp, DAG); | |||
| 7154 | if (!NotCat) return SDValue(); | |||
| 7155 | CatOp = DAG.getBitcast(CatOp.getValueType(), NotCat); | |||
| 7156 | } | |||
| 7157 | return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(V), V.getValueType(), CatOps); | |||
| 7158 | } | |||
| 7159 | return SDValue(); | |||
| 7160 | } | |||
| 7161 | ||||
| 7162 | void llvm::createUnpackShuffleMask(EVT VT, SmallVectorImpl<int> &Mask, | |||
| 7163 | bool Lo, bool Unary) { | |||
| 7164 | assert(VT.getScalarType().isSimple() && (VT.getSizeInBits() % 128) == 0 && | |||
| 7165 | "Illegal vector type to unpack"); | |||
| 7166 | assert(Mask.empty() && "Expected an empty shuffle mask vector"); | |||
| 7167 | int NumElts = VT.getVectorNumElements(); | |||
| 7168 | int NumEltsInLane = 128 / VT.getScalarSizeInBits(); | |||
| 7169 | for (int i = 0; i < NumElts; ++i) { | |||
| 7170 | unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; | |||
| 7171 | int Pos = (i % NumEltsInLane) / 2 + LaneStart; | |||
| 7172 | Pos += (Unary ? 0 : NumElts * (i % 2)); | |||
| 7173 | Pos += (Lo ? 0 : NumEltsInLane / 2); | |||
| 7174 | Mask.push_back(Pos); | |||
| 7175 | } | |||
| 7176 | } | |||
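| | // Worked example (illustrative): for VT = v8i16, Lo = true, Unary = false | |||
| | // this produces <0,8,1,9,2,10,3,11> - the PUNPCKLWD interleave of the low | |||
| | // halves of both inputs - while Lo = false yields <4,12,5,13,6,14,7,15>. | |||
| | // With Unary = true both sides index the first input: <0,0,1,1,2,2,3,3>. | |||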
| 7177 | ||||
| 7178 | /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation | |||
| 7179 | /// imposed by AVX and specific to the unary pattern. Example: | |||
| 7180 | /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3> | |||
| 7181 | /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7> | |||
| 7182 | void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, | |||
| 7183 | bool Lo) { | |||
| 7184 | assert(Mask.empty() && "Expected an empty shuffle mask vector"); | |||
| 7185 | int NumElts = VT.getVectorNumElements(); | |||
| 7186 | for (int i = 0; i < NumElts; ++i) { | |||
| 7187 | int Pos = i / 2; | |||
| 7188 | Pos += (Lo ? 0 : NumElts / 2); | |||
| 7189 | Mask.push_back(Pos); | |||
| 7190 | } | |||
| 7191 | } | |||
| 7192 | ||||
| 7193 | // Attempt to constant fold, else just create a VECTOR_SHUFFLE. | |||
| 7194 | static SDValue getVectorShuffle(SelectionDAG &DAG, EVT VT, const SDLoc &dl, | |||
| 7195 | SDValue V1, SDValue V2, ArrayRef<int> Mask) { | |||
| 7196 | if ((ISD::isBuildVectorOfConstantSDNodes(V1.getNode()) || V1.isUndef()) && | |||
| 7197 | (ISD::isBuildVectorOfConstantSDNodes(V2.getNode()) || V2.isUndef())) { | |||
| 7198 | SmallVector<SDValue> Ops(Mask.size(), DAG.getUNDEF(VT.getScalarType())); | |||
| 7199 | for (int I = 0, NumElts = Mask.size(); I != NumElts; ++I) { | |||
| 7200 | int M = Mask[I]; | |||
| 7201 | if (M < 0) | |||
| 7202 | continue; | |||
| 7203 | SDValue V = (M < NumElts) ? V1 : V2; | |||
| 7204 | if (V.isUndef()) | |||
| 7205 | continue; | |||
| 7206 | Ops[I] = V.getOperand(M % NumElts); | |||
| 7207 | } | |||
| 7208 | return DAG.getBuildVector(VT, dl, Ops); | |||
| 7209 | } | |||
| 7210 | ||||
| 7211 | return DAG.getVectorShuffle(VT, dl, V1, V2, Mask); | |||
| 7212 | } | |||
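| | // Example of the constant fold above (illustrative): with | |||
| | // V1 = build_vector(0,1,2,3), V2 = undef and Mask = <2,2,3,3>, the loop | |||
| | // gathers V1's operands and returns build_vector(2,2,3,3) directly instead | |||
| | // of creating a VECTOR_SHUFFLE node. | |||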
| 7213 | ||||
| 7214 | /// Returns a vector_shuffle node for an unpackl operation. | |||
| 7215 | static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, EVT VT, | |||
| 7216 | SDValue V1, SDValue V2) { | |||
| 7217 | SmallVector<int, 8> Mask; | |||
| 7218 | createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false); | |||
| 7219 | return getVectorShuffle(DAG, VT, dl, V1, V2, Mask); | |||
| 7220 | } | |||
| 7221 | ||||
| 7222 | /// Returns a vector_shuffle node for an unpackh operation. | |||
| 7223 | static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, EVT VT, | |||
| 7224 | SDValue V1, SDValue V2) { | |||
| 7225 | SmallVector<int, 8> Mask; | |||
| 7226 | createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false); | |||
| 7227 | return getVectorShuffle(DAG, VT, dl, V1, V2, Mask); | |||
| 7228 | } | |||
| 7229 | ||||
| 7230 | /// Returns a node that packs the LHS + RHS nodes together at half width. | |||
| 7231 | /// May return X86ISD::PACKSS/PACKUS, packing the top/bottom half. | |||
| 7232 | /// TODO: Add subvector splitting if/when we have a need for it. | |||
| 7233 | static SDValue getPack(SelectionDAG &DAG, const X86Subtarget &Subtarget, | |||
| 7234 | const SDLoc &dl, MVT VT, SDValue LHS, SDValue RHS, | |||
| 7235 | bool PackHiHalf = false) { | |||
| 7236 | MVT OpVT = LHS.getSimpleValueType(); | |||
| 7237 | unsigned EltSizeInBits = VT.getScalarSizeInBits(); | |||
| 7238 | bool UsePackUS = Subtarget.hasSSE41() || EltSizeInBits == 8; | |||
| 7239 | assert(OpVT == RHS.getSimpleValueType() && | |||
| 7240 | VT.getSizeInBits() == OpVT.getSizeInBits() && | |||
| 7241 | (EltSizeInBits * 2) == OpVT.getScalarSizeInBits() && | |||
| 7242 | "Unexpected PACK operand types"); | |||
| 7243 | assert((EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) && | |||
| 7244 | "Unexpected PACK result type"); | |||
| 7245 | ||||
| 7246 | // Rely on vector shuffles for vXi64 -> vXi32 packing. | |||
| 7247 | if (EltSizeInBits == 32) { | |||
| 7248 | SmallVector<int> PackMask; | |||
| 7249 | int Offset = PackHiHalf ? 1 : 0; | |||
| 7250 | int NumElts = VT.getVectorNumElements(); | |||
| 7251 | for (int I = 0; I != NumElts; I += 4) { | |||
| 7252 | PackMask.push_back(I + Offset); | |||
| 7253 | PackMask.push_back(I + Offset + 2); | |||
| 7254 | PackMask.push_back(I + Offset + NumElts); | |||
| 7255 | PackMask.push_back(I + Offset + NumElts + 2); | |||
| 7256 | } | |||
| 7257 | return DAG.getVectorShuffle(VT, dl, DAG.getBitcast(VT, LHS), | |||
| 7258 | DAG.getBitcast(VT, RHS), PackMask); | |||
| 7259 | } | |||
| 7260 | ||||
| 7261 | // See if we already have sufficient leading bits for PACKSS/PACKUS. | |||
| 7262 | if (!PackHiHalf) { | |||
| 7263 | if (UsePackUS && | |||
| 7264 | DAG.computeKnownBits(LHS).countMaxActiveBits() <= EltSizeInBits && | |||
| 7265 | DAG.computeKnownBits(RHS).countMaxActiveBits() <= EltSizeInBits) | |||
| 7266 | return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS); | |||
| 7267 | ||||
| 7268 | if (DAG.ComputeMaxSignificantBits(LHS) <= EltSizeInBits && | |||
| 7269 | DAG.ComputeMaxSignificantBits(RHS) <= EltSizeInBits) | |||
| 7270 | return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS); | |||
| 7271 | } | |||
| 7272 | ||||
| 7273 | // Fall back to sign/zero extending the requested half and pack. | |||
| 7274 | SDValue Amt = DAG.getTargetConstant(EltSizeInBits, dl, MVT::i8); | |||
| 7275 | if (UsePackUS) { | |||
| 7276 | if (PackHiHalf) { | |||
| 7277 | LHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, LHS, Amt); | |||
| 7278 | RHS = DAG.getNode(X86ISD::VSRLI, dl, OpVT, RHS, Amt); | |||
| 7279 | } else { | |||
| 7280 | SDValue Mask = DAG.getConstant((1ULL << EltSizeInBits) - 1, dl, OpVT); | |||
| 7281 | LHS = DAG.getNode(ISD::AND, dl, OpVT, LHS, Mask); | |||
| 7282 | RHS = DAG.getNode(ISD::AND, dl, OpVT, RHS, Mask); | |||
| 7283 | } | |||
| 7284 | return DAG.getNode(X86ISD::PACKUS, dl, VT, LHS, RHS); | |||
| 7285 | } | |||
| 7286 | ||||
| 7287 | if (!PackHiHalf) { | |||
| 7288 | LHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, LHS, Amt); | |||
| 7289 | RHS = DAG.getNode(X86ISD::VSHLI, dl, OpVT, RHS, Amt); | |||
| 7290 | } | |||
| 7291 | LHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, LHS, Amt); | |||
| 7292 | RHS = DAG.getNode(X86ISD::VSRAI, dl, OpVT, RHS, Amt); | |||
| 7293 | return DAG.getNode(X86ISD::PACKSS, dl, VT, LHS, RHS); | |||
| 7294 | } | |||
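| | // Sketch of the PACKSS fallback above (illustrative): packing v4i32 -> | |||
| | // v8i16 with PackHiHalf == false shifts each i32 left by 16 and then | |||
| | // arithmetic-shifts it right by 16, sign-extending the low half so the | |||
| | // saturating PACKSSDW becomes an exact truncation; with PackHiHalf == true | |||
| | // the VSRAI alone moves the sign-extended high half into the low bits. | |||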
| 7295 | ||||
| 7296 | /// Return a vector_shuffle of the specified vector and a zero or undef vector. | |||
| 7297 | /// This produces a shuffle where the low element of V2 is swizzled into the | |||
| 7298 | /// zero/undef vector, landing at element Idx. | |||
| 7299 | /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). | |||
| 7300 | static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx, | |||
| 7301 | bool IsZero, | |||
| 7302 | const X86Subtarget &Subtarget, | |||
| 7303 | SelectionDAG &DAG) { | |||
| 7304 | MVT VT = V2.getSimpleValueType(); | |||
| 7305 | SDValue V1 = IsZero | |||
| 7306 | ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); | |||
| 7307 | int NumElems = VT.getVectorNumElements(); | |||
| 7308 | SmallVector<int, 16> MaskVec(NumElems); | |||
| 7309 | for (int i = 0; i != NumElems; ++i) | |||
| 7310 | // If this is the insertion idx, put the low elt of V2 here. | |||
| 7311 | MaskVec[i] = (i == Idx) ? NumElems : i; | |||
| 7312 | return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec); | |||
| 7313 | } | |||
| 7314 | ||||
| 7315 | static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) { | |||
| 7316 | if (Ptr.getOpcode() == X86ISD::Wrapper || | |||
| 7317 | Ptr.getOpcode() == X86ISD::WrapperRIP) | |||
| 7318 | Ptr = Ptr.getOperand(0); | |||
| 7319 | ||||
| 7320 | auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr); | |||
| 7321 | if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0) | |||
| 7322 | return nullptr; | |||
| 7323 | ||||
| 7324 | return CNode->getConstVal(); | |||
| 7325 | } | |||
| 7326 | ||||
| 7327 | static const Constant *getTargetConstantFromNode(LoadSDNode *Load) { | |||
| 7328 | if (!Load || !ISD::isNormalLoad(Load)) | |||
| 7329 | return nullptr; | |||
| 7330 | return getTargetConstantFromBasePtr(Load->getBasePtr()); | |||
| 7331 | } | |||
| 7332 | ||||
| 7333 | static const Constant *getTargetConstantFromNode(SDValue Op) { | |||
| 7334 | Op = peekThroughBitcasts(Op); | |||
| 7335 | return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op)); | |||
| 7336 | } | |||
| 7337 | ||||
| 7338 | const Constant * | |||
| 7339 | X86TargetLowering::getTargetConstantFromLoad(LoadSDNode *LD) const { | |||
| 7340 | assert(LD && "Unexpected null LoadSDNode"); | |||
| 7341 | return getTargetConstantFromNode(LD); | |||
| 7342 | } | |||
| 7343 | ||||
| 7344 | // Extract raw constant bits from constant pools. | |||
| 7345 | static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, | |||
| 7346 | APInt &UndefElts, | |||
| 7347 | SmallVectorImpl<APInt> &EltBits, | |||
| 7348 | bool AllowWholeUndefs = true, | |||
| 7349 | bool AllowPartialUndefs = true) { | |||
| 7350 | assert(EltBits.empty() && "Expected an empty EltBits vector"); | |||
| 7351 | ||||
| 7352 | Op = peekThroughBitcasts(Op); | |||
| 7353 | ||||
| 7354 | EVT VT = Op.getValueType(); | |||
| 7355 | unsigned SizeInBits = VT.getSizeInBits(); | |||
| 7356 | assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!"); | |||
| 7357 | unsigned NumElts = SizeInBits / EltSizeInBits; | |||
| 7358 | ||||
| 7359 | // Bitcast a source array of element bits to the target size. | |||
| 7360 | auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) { | |||
| 7361 | unsigned NumSrcElts = UndefSrcElts.getBitWidth(); | |||
| 7362 | unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth(); | |||
| 7363 | assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits && | |||
| 7364 | "Constant bit sizes don't match"); | |||
| 7365 | ||||
| 7366 | // Don't split if we don't allow undef bits. | |||
| 7367 | bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs; | |||
| 7368 | if (UndefSrcElts.getBoolValue() && !AllowUndefs) | |||
| 7369 | return false; | |||
| 7370 | ||||
| 7371 | // If we're already the right size, don't bother bitcasting. | |||
| 7372 | if (NumSrcElts == NumElts) { | |||
| 7373 | UndefElts = UndefSrcElts; | |||
| 7374 | EltBits.assign(SrcEltBits.begin(), SrcEltBits.end()); | |||
| 7375 | return true; | |||
| 7376 | } | |||
| 7377 | ||||
| 7378 | // Extract all the undef/constant element data and pack into single bitsets. | |||
| 7379 | APInt UndefBits(SizeInBits, 0); | |||
| 7380 | APInt MaskBits(SizeInBits, 0); | |||
| 7381 | ||||
| 7382 | for (unsigned i = 0; i != NumSrcElts; ++i) { | |||
| 7383 | unsigned BitOffset = i * SrcEltSizeInBits; | |||
| 7384 | if (UndefSrcElts[i]) | |||
| 7385 | UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits); | |||
| 7386 | MaskBits.insertBits(SrcEltBits[i], BitOffset); | |||
| 7387 | } | |||
| 7388 | ||||
| 7389 | // Split the undef/constant single bitset data into the target elements. | |||
| 7390 | UndefElts = APInt(NumElts, 0); | |||
| 7391 | EltBits.resize(NumElts, APInt(EltSizeInBits, 0)); | |||
| 7392 | ||||
| 7393 | for (unsigned i = 0; i != NumElts; ++i) { | |||
| 7394 | unsigned BitOffset = i * EltSizeInBits; | |||
| 7395 | APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset); | |||
| 7396 | ||||
| 7397 | // Only treat an element as UNDEF if all bits are UNDEF. | |||
| 7398 | if (UndefEltBits.isAllOnes()) { | |||
| 7399 | if (!AllowWholeUndefs) | |||
| 7400 | return false; | |||
| 7401 | UndefElts.setBit(i); | |||
| 7402 | continue; | |||
| 7403 | } | |||
| 7404 | ||||
| 7405 | // If only some bits are UNDEF then treat them as zero (or bail if not | |||
| 7406 | // supported). | |||
| 7407 | if (UndefEltBits.getBoolValue() && !AllowPartialUndefs) | |||
| 7408 | return false; | |||
| 7409 | ||||
| 7410 | EltBits[i] = MaskBits.extractBits(EltSizeInBits, BitOffset); | |||
| 7411 | } | |||
| 7412 | return true; | |||
| 7413 | }; | |||
| 7414 | ||||
| 7415 | // Collect constant bits and insert into mask/undef bit masks. | |||
| 7416 | auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs, | |||
| 7417 | unsigned UndefBitIndex) { | |||
| 7418 | if (!Cst) | |||
| 7419 | return false; | |||
| 7420 | if (isa<UndefValue>(Cst)) { | |||
| 7421 | Undefs.setBit(UndefBitIndex); | |||
| 7422 | return true; | |||
| 7423 | } | |||
| 7424 | if (auto *CInt = dyn_cast<ConstantInt>(Cst)) { | |||
| 7425 | Mask = CInt->getValue(); | |||
| 7426 | return true; | |||
| 7427 | } | |||
| 7428 | if (auto *CFP = dyn_cast<ConstantFP>(Cst)) { | |||
| 7429 | Mask = CFP->getValueAPF().bitcastToAPInt(); | |||
| 7430 | return true; | |||
| 7431 | } | |||
| 7432 | return false; | |||
| 7433 | }; | |||
| 7434 | ||||
| 7435 | // Handle UNDEFs. | |||
| 7436 | if (Op.isUndef()) { | |||
| 7437 | APInt UndefSrcElts = APInt::getAllOnes(NumElts); | |||
| 7438 | SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0)); | |||
| 7439 | return CastBitData(UndefSrcElts, SrcEltBits); | |||
| 7440 | } | |||
| 7441 | ||||
| 7442 | // Extract scalar constant bits. | |||
| 7443 | if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) { | |||
| 7444 | APInt UndefSrcElts = APInt::getZero(1); | |||
| 7445 | SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue()); | |||
| 7446 | return CastBitData(UndefSrcElts, SrcEltBits); | |||
| 7447 | } | |||
| 7448 | if (auto *Cst = dyn_cast<ConstantFPSDNode>(Op)) { | |||
| 7449 | APInt UndefSrcElts = APInt::getZero(1); | |||
| 7450 | APInt RawBits = Cst->getValueAPF().bitcastToAPInt(); | |||
| 7451 | SmallVector<APInt, 64> SrcEltBits(1, RawBits); | |||
| 7452 | return CastBitData(UndefSrcElts, SrcEltBits); | |||
| 7453 | } | |||
| 7454 | ||||
| 7455 | // Extract constant bits from build vector. | |||
| 7456 | if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) { | |||
| 7457 | BitVector Undefs; | |||
| 7458 | SmallVector<APInt> SrcEltBits; | |||
| 7459 | unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); | |||
| 7460 | if (BV->getConstantRawBits(true, SrcEltSizeInBits, SrcEltBits, Undefs)) { | |||
| 7461 | APInt UndefSrcElts = APInt::getZero(SrcEltBits.size()); | |||
| 7462 | for (unsigned I = 0, E = SrcEltBits.size(); I != E; ++I) | |||
| 7463 | if (Undefs[I]) | |||
| 7464 | UndefSrcElts.setBit(I); | |||
| 7465 | return CastBitData(UndefSrcElts, SrcEltBits); | |||
| 7466 | } | |||
| 7467 | } | |||
| 7468 | ||||
| 7469 | // Extract constant bits from constant pool vector. | |||
| 7470 | if (auto *Cst = getTargetConstantFromNode(Op)) { | |||
| 7471 | Type *CstTy = Cst->getType(); | |||
| 7472 | unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits(); | |||
| 7473 | if (!CstTy->isVectorTy() || (CstSizeInBits % SizeInBits) != 0) | |||
| 7474 | return false; | |||
| 7475 | ||||
| 7476 | unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits(); | |||
| 7477 | unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; | |||
| 7478 | ||||
| 7479 | APInt UndefSrcElts(NumSrcElts, 0); | |||
| 7480 | SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0)); | |||
| 7481 | for (unsigned i = 0; i != NumSrcElts; ++i) | |||
| 7482 | if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i], | |||
| 7483 | UndefSrcElts, i)) | |||
| 7484 | return false; | |||
| 7485 | ||||
| 7486 | return CastBitData(UndefSrcElts, SrcEltBits); | |||
| 7487 | } | |||
| 7488 | ||||
| 7489 | // Extract constant bits from a broadcasted constant pool scalar. | |||
| 7490 | if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD && | |||
| 7491 | EltSizeInBits <= VT.getScalarSizeInBits()) { | |||
| 7492 | auto *MemIntr = cast<MemIntrinsicSDNode>(Op); | |||
| 7493 | if (MemIntr->getMemoryVT().getScalarSizeInBits() != VT.getScalarSizeInBits()) | |||
| 7494 | return false; | |||
| 7495 | ||||
| 7496 | SDValue Ptr = MemIntr->getBasePtr(); | |||
| 7497 | if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) { | |||
| 7498 | unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits(); | |||
| 7499 | unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; | |||
| 7500 | ||||
| 7501 | APInt UndefSrcElts(NumSrcElts, 0); | |||
| 7502 | SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0)); | |||
| 7503 | if (CollectConstantBits(C, SrcEltBits[0], UndefSrcElts, 0)) { | |||
| 7504 | if (UndefSrcElts[0]) | |||
| 7505 | UndefSrcElts.setBits(0, NumSrcElts); | |||
| 7506 | SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]); | |||
| 7507 | return CastBitData(UndefSrcElts, SrcEltBits); | |||
| 7508 | } | |||
| 7509 | } | |||
| 7510 | } | |||
| 7511 | ||||
| 7512 | // Extract constant bits from a subvector broadcast. | |||
| 7513 | if (Op.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) { | |||
| 7514 | auto *MemIntr = cast<MemIntrinsicSDNode>(Op); | |||
| 7515 | SDValue Ptr = MemIntr->getBasePtr(); | |||
| 7516 | // The source constant may be larger than the subvector broadcast, so make | |||
| 7517 | // sure we extract the correct subvector constants. | |||
| 7518 | if (const Constant *Cst = getTargetConstantFromBasePtr(Ptr)) { | |||
| 7519 | Type *CstTy = Cst->getType(); | |||
| 7520 | unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits(); | |||
| 7521 | unsigned SubVecSizeInBits = MemIntr->getMemoryVT().getStoreSizeInBits(); | |||
| 7522 | if (!CstTy->isVectorTy() || (CstSizeInBits % SubVecSizeInBits) != 0 || | |||
| 7523 | (SizeInBits % SubVecSizeInBits) != 0) | |||
| 7524 | return false; | |||
| 7525 | unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits(); | |||
| 7526 | unsigned NumSubElts = SubVecSizeInBits / CstEltSizeInBits; | |||
| 7527 | unsigned NumSubVecs = SizeInBits / SubVecSizeInBits; | |||
| 7528 | APInt UndefSubElts(NumSubElts, 0); | |||
| 7529 | SmallVector<APInt, 64> SubEltBits(NumSubElts * NumSubVecs, | |||
| 7530 | APInt(CstEltSizeInBits, 0)); | |||
| 7531 | for (unsigned i = 0; i != NumSubElts; ++i) { | |||
| 7532 | if (!CollectConstantBits(Cst->getAggregateElement(i), SubEltBits[i], | |||
| 7533 | UndefSubElts, i)) | |||
| 7534 | return false; | |||
| 7535 | for (unsigned j = 1; j != NumSubVecs; ++j) | |||
| 7536 | SubEltBits[i + (j * NumSubElts)] = SubEltBits[i]; | |||
| 7537 | } | |||
| 7538 | UndefSubElts = APInt::getSplat(NumSubVecs * UndefSubElts.getBitWidth(), | |||
| 7539 | UndefSubElts); | |||
| 7540 | return CastBitData(UndefSubElts, SubEltBits); | |||
| 7541 | } | |||
| 7542 | } | |||
| 7543 | ||||
| 7544 | // Extract a rematerialized scalar constant insertion. | |||
| 7545 | if (Op.getOpcode() == X86ISD::VZEXT_MOVL && | |||
| 7546 | Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && | |||
| 7547 | isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) { | |||
| 7548 | unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); | |||
| 7549 | unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits; | |||
| 7550 | ||||
| 7551 | APInt UndefSrcElts(NumSrcElts, 0); | |||
| 7552 | SmallVector<APInt, 64> SrcEltBits; | |||
| 7553 | auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0)); | |||
| 7554 | SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits)); | |||
| 7555 | SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0)); | |||
| 7556 | return CastBitData(UndefSrcElts, SrcEltBits); | |||
| 7557 | } | |||
| 7558 | ||||
| 7559 | // Insert constant bits from base and subvector sources. | |||
| 7560 | if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) { | |||
| 7561 | // Bitcasting to larger elements can lose track of undefs, so to be safe | |||
| 7562 | // don't allow any in that case. | |||
| 7563 | unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); | |||
| 7564 | bool AllowUndefs = EltSizeInBits >= SrcEltSizeInBits; | |||
| 7565 | ||||
| 7566 | APInt UndefSrcElts, UndefSubElts; | |||
| 7567 | SmallVector<APInt, 32> EltSrcBits, EltSubBits; | |||
| 7568 | if (getTargetConstantBitsFromNode(Op.getOperand(1), SrcEltSizeInBits, | |||
| 7569 | UndefSubElts, EltSubBits, | |||
| 7570 | AllowWholeUndefs && AllowUndefs, | |||
| 7571 | AllowPartialUndefs && AllowUndefs) && | |||
| 7572 | getTargetConstantBitsFromNode(Op.getOperand(0), SrcEltSizeInBits, | |||
| 7573 | UndefSrcElts, EltSrcBits, | |||
| 7574 | AllowWholeUndefs && AllowUndefs, | |||
| 7575 | AllowPartialUndefs && AllowUndefs)) { | |||
| 7576 | unsigned BaseIdx = Op.getConstantOperandVal(2); | |||
| 7577 | UndefSrcElts.insertBits(UndefSubElts, BaseIdx); | |||
| 7578 | for (unsigned i = 0, e = EltSubBits.size(); i != e; ++i) | |||
| 7579 | EltSrcBits[BaseIdx + i] = EltSubBits[i]; | |||
| 7580 | return CastBitData(UndefSrcElts, EltSrcBits); | |||
| 7581 | } | |||
| 7582 | } | |||
| 7583 | ||||
| 7584 | // Extract constant bits from a subvector's source. | |||
| 7585 | if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) { | |||
| 7586 | // TODO - support extract_subvector through bitcasts. | |||
| 7587 | if (EltSizeInBits != VT.getScalarSizeInBits()) | |||
| 7588 | return false; | |||
| 7589 | ||||
| 7590 | if (getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, | |||
| 7591 | UndefElts, EltBits, AllowWholeUndefs, | |||
| 7592 | AllowPartialUndefs)) { | |||
| 7593 | EVT SrcVT = Op.getOperand(0).getValueType(); | |||
| 7594 | unsigned NumSrcElts = SrcVT.getVectorNumElements(); | |||
| 7595 | unsigned NumSubElts = VT.getVectorNumElements(); | |||
| 7596 | unsigned BaseIdx = Op.getConstantOperandVal(1); | |||
| 7597 | UndefElts = UndefElts.extractBits(NumSubElts, BaseIdx); | |||
| 7598 | if ((BaseIdx + NumSubElts) != NumSrcElts) | |||
| 7599 | EltBits.erase(EltBits.begin() + BaseIdx + NumSubElts, EltBits.end()); | |||
| 7600 | if (BaseIdx != 0) | |||
| 7601 | EltBits.erase(EltBits.begin(), EltBits.begin() + BaseIdx); | |||
| 7602 | return true; | |||
| 7603 | } | |||
| 7604 | } | |||
| 7605 | ||||
| 7606 | // Extract constant bits from shuffle node sources. | |||
| 7607 | if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Op)) { | |||
| 7608 | // TODO - support shuffle through bitcasts. | |||
| 7609 | if (EltSizeInBits != VT.getScalarSizeInBits()) | |||
| 7610 | return false; | |||
| 7611 | ||||
| 7612 | ArrayRef<int> Mask = SVN->getMask(); | |||
| 7613 | if ((!AllowWholeUndefs || !AllowPartialUndefs) && | |||
| 7614 | llvm::any_of(Mask, [](int M) { return M < 0; })) | |||
| 7615 | return false; | |||
| 7616 | ||||
| 7617 | APInt UndefElts0, UndefElts1; | |||
| 7618 | SmallVector<APInt, 32> EltBits0, EltBits1; | |||
| 7619 | if (isAnyInRange(Mask, 0, NumElts) && | |||
| 7620 | !getTargetConstantBitsFromNode(Op.getOperand(0), EltSizeInBits, | |||
| 7621 | UndefElts0, EltBits0, AllowWholeUndefs, | |||
| 7622 | AllowPartialUndefs)) | |||
| 7623 | return false; | |||
| 7624 | if (isAnyInRange(Mask, NumElts, 2 * NumElts) && | |||
| 7625 | !getTargetConstantBitsFromNode(Op.getOperand(1), EltSizeInBits, | |||
| 7626 | UndefElts1, EltBits1, AllowWholeUndefs, | |||
| 7627 | AllowPartialUndefs)) | |||
| 7628 | return false; | |||
| 7629 | ||||
| 7630 | UndefElts = APInt::getZero(NumElts); | |||
| 7631 | for (int i = 0; i != (int)NumElts; ++i) { | |||
| 7632 | int M = Mask[i]; | |||
| 7633 | if (M < 0) { | |||
| 7634 | UndefElts.setBit(i); | |||
| 7635 | EltBits.push_back(APInt::getZero(EltSizeInBits)); | |||
| 7636 | } else if (M < (int)NumElts) { | |||
| 7637 | if (UndefElts0[M]) | |||
| 7638 | UndefElts.setBit(i); | |||
| 7639 | EltBits.push_back(EltBits0[M]); | |||
| 7640 | } else { | |||
| 7641 | if (UndefElts1[M - NumElts]) | |||
| 7642 | UndefElts.setBit(i); | |||
| 7643 | EltBits.push_back(EltBits1[M - NumElts]); | |||
| 7644 | } | |||
| 7645 | } | |||
| 7646 | return true; | |||
| 7647 | } | |||
| 7648 | ||||
| 7649 | return false; | |||
| 7650 | } | |||
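| | // Worked example of the CastBitData repacking (illustrative): splitting a | |||
| | // v2i64 constant <0x0000000200000001, undef> at EltSizeInBits == 32 yields | |||
| | // EltBits = {1, 2, 0, 0} with UndefElts = 0b1100, because both 32-bit | |||
| | // halves of the undef element are wholly undef. Merging in the other | |||
| | // direction only succeeds for partially-undef elements when | |||
| | // AllowPartialUndefs is set, with the undef bits treated as zero. | |||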
| 7651 | ||||
| 7652 | namespace llvm { | |||
| 7653 | namespace X86 { | |||
| 7654 | bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) { | |||
| 7655 | APInt UndefElts; | |||
| 7656 | SmallVector<APInt, 16> EltBits; | |||
| 7657 | if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(), | |||
| 7658 | UndefElts, EltBits, true, | |||
| 7659 | AllowPartialUndefs)) { | |||
| 7660 | int SplatIndex = -1; | |||
| 7661 | for (int i = 0, e = EltBits.size(); i != e; ++i) { | |||
| 7662 | if (UndefElts[i]) | |||
| 7663 | continue; | |||
| 7664 | if (0 <= SplatIndex && EltBits[i] != EltBits[SplatIndex]) { | |||
| 7665 | SplatIndex = -1; | |||
| 7666 | break; | |||
| 7667 | } | |||
| 7668 | SplatIndex = i; | |||
| 7669 | } | |||
| 7670 | if (0 <= SplatIndex) { | |||
| 7671 | SplatVal = EltBits[SplatIndex]; | |||
| 7672 | return true; | |||
| 7673 | } | |||
| 7674 | } | |||
| 7675 | ||||
| 7676 | return false; | |||
| 7677 | } | |||
| 7678 | } // namespace X86 | |||
| 7679 | } // namespace llvm | |||
| 7680 | ||||
| 7681 | static bool getTargetShuffleMaskIndices(SDValue MaskNode, | |||
| 7682 | unsigned MaskEltSizeInBits, | |||
| 7683 | SmallVectorImpl<uint64_t> &RawMask, | |||
| 7684 | APInt &UndefElts) { | |||
| 7685 | // Extract the raw target constant bits. | |||
| 7686 | SmallVector<APInt, 64> EltBits; | |||
| 7687 | if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts, | |||
| 7688 | EltBits, /* AllowWholeUndefs */ true, | |||
| 7689 | /* AllowPartialUndefs */ false)) | |||
| 7690 | return false; | |||
| 7691 | ||||
| 7692 | // Insert the extracted elements into the mask. | |||
| 7693 | for (const APInt &Elt : EltBits) | |||
| 7694 | RawMask.push_back(Elt.getZExtValue()); | |||
| 7695 | ||||
| 7696 | return true; | |||
| 7697 | } | |||
| 7698 | ||||
| 7699 | /// Create a shuffle mask that matches the PACKSS/PACKUS truncation. | |||
| 7700 | /// A multi-stage pack shuffle mask is created by specifying NumStages > 1. | |||
| 7701 | /// Note: This ignores saturation, so inputs must be checked first. | |||
| 7702 | static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, | |||
| 7703 | bool Unary, unsigned NumStages = 1) { | |||
| 7704 | assert(Mask.empty() && "Expected an empty shuffle mask vector"); | |||
| 7705 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 7706 | unsigned NumLanes = VT.getSizeInBits() / 128; | |||
| 7707 | unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits(); | |||
| 7708 | unsigned Offset = Unary ? 0 : NumElts; | |||
| 7709 | unsigned Repetitions = 1u << (NumStages - 1); | |||
| 7710 | unsigned Increment = 1u << NumStages; | |||
| 7711 | assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction"); | |||
| 7712 | ||||
| 7713 | for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { | |||
| 7714 | for (unsigned Stage = 0; Stage != Repetitions; ++Stage) { | |||
| 7715 | for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment) | |||
| 7716 | Mask.push_back(Elt + (Lane * NumEltsPerLane)); | |||
| 7717 | for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment) | |||
| 7718 | Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset); | |||
| 7719 | } | |||
| 7720 | } | |||
| 7721 | } | |||
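| | // Worked example (illustrative): a unary single-stage pack of MVT::v16i8 | |||
| | // gives <0,2,4,...,14,0,2,...,14>, i.e. one PACK taking the low byte of | |||
| | // each i16 element. With NumStages == 2 the increment grows to 4, selecting | |||
| | // every fourth byte - the bytes that survive two back-to-back truncation | |||
| | // stages (e.g. v4i32 -> v16i8 via two PACK ops). | |||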
| 7722 | ||||
| 7723 | // Split the demanded elts of a PACKSS/PACKUS node between its operands. | |||
| 7724 | static void getPackDemandedElts(EVT VT, const APInt &DemandedElts, | |||
| 7725 | APInt &DemandedLHS, APInt &DemandedRHS) { | |||
| 7726 | int NumLanes = VT.getSizeInBits() / 128; | |||
| 7727 | int NumElts = DemandedElts.getBitWidth(); | |||
| 7728 | int NumInnerElts = NumElts / 2; | |||
| 7729 | int NumEltsPerLane = NumElts / NumLanes; | |||
| 7730 | int NumInnerEltsPerLane = NumInnerElts / NumLanes; | |||
| 7731 | ||||
| 7732 | DemandedLHS = APInt::getZero(NumInnerElts); | |||
| 7733 | DemandedRHS = APInt::getZero(NumInnerElts); | |||
| 7734 | ||||
| 7735 | // Map DemandedElts to the packed operands. | |||
| 7736 | for (int Lane = 0; Lane != NumLanes; ++Lane) { | |||
| 7737 | for (int Elt = 0; Elt != NumInnerEltsPerLane; ++Elt) { | |||
| 7738 | int OuterIdx = (Lane * NumEltsPerLane) + Elt; | |||
| 7739 | int InnerIdx = (Lane * NumInnerEltsPerLane) + Elt; | |||
| 7740 | if (DemandedElts[OuterIdx]) | |||
| 7741 | DemandedLHS.setBit(InnerIdx); | |||
| 7742 | if (DemandedElts[OuterIdx + NumInnerEltsPerLane]) | |||
| 7743 | DemandedRHS.setBit(InnerIdx); | |||
| 7744 | } | |||
| 7745 | } | |||
| 7746 | } | |||
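| | // Example of the mapping above (illustrative): for a v32i8 PACK result, | |||
| | // demanding result element 20 (lane 1, first half) sets DemandedLHS bit 12 | |||
| | // (lane 1, element 4 of the v16i16 LHS), while demanding element 28 (lane 1, | |||
| | // second half) sets DemandedRHS bit 12 instead. | |||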
| 7747 | ||||
| 7748 | // Split the demanded elts of a HADD/HSUB node between its operands. | |||
| 7749 | static void getHorizDemandedElts(EVT VT, const APInt &DemandedElts, | |||
| 7750 | APInt &DemandedLHS, APInt &DemandedRHS) { | |||
| 7751 | int NumLanes = VT.getSizeInBits() / 128; | |||
| 7752 | int NumElts = DemandedElts.getBitWidth(); | |||
| 7753 | int NumEltsPerLane = NumElts / NumLanes; | |||
| 7754 | int HalfEltsPerLane = NumEltsPerLane / 2; | |||
| 7755 | ||||
| 7756 | DemandedLHS = APInt::getZero(NumElts); | |||
| 7757 | DemandedRHS = APInt::getZero(NumElts); | |||
| 7758 | ||||
| 7759 | // Map DemandedElts to the horizontal operands. | |||
| 7760 | for (int Idx = 0; Idx != NumElts; ++Idx) { | |||
| 7761 | if (!DemandedElts[Idx]) | |||
| 7762 | continue; | |||
| 7763 | int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane; | |||
| 7764 | int LocalIdx = Idx % NumEltsPerLane; | |||
| 7765 | if (LocalIdx < HalfEltsPerLane) { | |||
| 7766 | DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 0); | |||
| 7767 | DemandedLHS.setBit(LaneIdx + 2 * LocalIdx + 1); | |||
| 7768 | } else { | |||
| 7769 | LocalIdx -= HalfEltsPerLane; | |||
| 7770 | DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 0); | |||
| 7771 | DemandedRHS.setBit(LaneIdx + 2 * LocalIdx + 1); | |||
| 7772 | } | |||
| 7773 | } | |||
| 7774 | } | |||
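| | // Example (illustrative): for a v8i32 HADD, result element 1 (lane 0, | |||
| | // LocalIdx 1 < HalfEltsPerLane) demands LHS elements 2 and 3, while result | |||
| | // element 2 (second half, LocalIdx 0 after adjustment) demands RHS elements | |||
| | // 0 and 1 - matching VPHADDD, whose low lane is {a0+a1, a2+a3, b0+b1, b2+b3}. | |||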
| 7775 | ||||
| 7776 | /// Calculates the shuffle mask corresponding to the target-specific opcode. | |||
| 7777 | /// If the mask could be calculated, returns it in \p Mask, returns the shuffle | |||
| 7778 | /// operands in \p Ops, and returns true. | |||
| 7779 | /// Sets \p IsUnary to true if only one source is used. Note that this will set | |||
| 7780 | /// IsUnary for shuffles which use a single input multiple times, and in those | |||
| 7781 | /// cases it will adjust the mask to only have indices within that single input. | |||
| 7782 | /// It is an error to call this with non-empty Mask/Ops vectors. | |||
| 7783 | static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, | |||
| 7784 | SmallVectorImpl<SDValue> &Ops, | |||
| 7785 | SmallVectorImpl<int> &Mask, bool &IsUnary) { | |||
| 7786 | unsigned NumElems = VT.getVectorNumElements(); | |||
| 7787 | unsigned MaskEltSize = VT.getScalarSizeInBits(); | |||
| 7788 | SmallVector<uint64_t, 32> RawMask; | |||
| 7789 | APInt RawUndefs; | |||
| 7790 | uint64_t ImmN; | |||
| 7791 | ||||
| 7792 | assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector"); | |||
| 7793 | assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector"); | |||
| 7794 | ||||
| 7795 | IsUnary = false; | |||
| 7796 | bool IsFakeUnary = false; | |||
| 7797 | switch (N->getOpcode()) { | |||
| 7798 | case X86ISD::BLENDI: | |||
| 7799 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); | |||
| 7800 | assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); | |||
| 7801 | ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); | |||
| 7802 | DecodeBLENDMask(NumElems, ImmN, Mask); | |||
| 7803 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); | |||
| 7804 | break; | |||
| 7805 | case X86ISD::SHUFP: | |||
| 7806 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); | |||
| 7807 | assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); | |||
| 7808 | ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); | |||
| 7809 | DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask); | |||
| 7810 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); | |||
| 7811 | break; | |||
| 7812 | case X86ISD::INSERTPS: | |||
| 7813 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); | |||
| 7814 | assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); | |||
| 7815 | ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); | |||
| 7816 | DecodeINSERTPSMask(ImmN, Mask); | |||
| 7817 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); | |||
| 7818 | break; | |||
| 7819 | case X86ISD::EXTRQI: | |||
| 7820 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); | |||
| 7821 | if (isa<ConstantSDNode>(N->getOperand(1)) && | |||
| 7822 | isa<ConstantSDNode>(N->getOperand(2))) { | |||
| 7823 | int BitLen = N->getConstantOperandVal(1); | |||
| 7824 | int BitIdx = N->getConstantOperandVal(2); | |||
| 7825 | DecodeEXTRQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask); | |||
| 7826 | IsUnary = true; | |||
| 7827 | } | |||
| 7828 | break; | |||
| 7829 | case X86ISD::INSERTQI: | |||
| 7830 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7830, __extension__ __PRETTY_FUNCTION__)); | |||
| 7831 | assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(1).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7831, __extension__ __PRETTY_FUNCTION__)); | |||
| 7832 | if (isa<ConstantSDNode>(N->getOperand(2)) && | |||
| 7833 | isa<ConstantSDNode>(N->getOperand(3))) { | |||
| 7834 | int BitLen = N->getConstantOperandVal(2); | |||
| 7835 | int BitIdx = N->getConstantOperandVal(3); | |||
| 7836 | DecodeINSERTQIMask(NumElems, MaskEltSize, BitLen, BitIdx, Mask); | |||
| 7837 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); | |||
| 7838 | } | |||
| 7839 | break; | |||
| 7840 | case X86ISD::UNPCKH: | |||
| 7841 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7841, __extension__ __PRETTY_FUNCTION__)); | |||
| 7842 | assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(1).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7842, __extension__ __PRETTY_FUNCTION__)); | |||
| 7843 | DecodeUNPCKHMask(NumElems, MaskEltSize, Mask); | |||
| 7844 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); | |||
| 7845 | break; | |||
| 7846 | case X86ISD::UNPCKL: | |||
| 7847 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7847, __extension__ __PRETTY_FUNCTION__)); | |||
| 7848 | assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(1).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7848, __extension__ __PRETTY_FUNCTION__)); | |||
| 7849 | DecodeUNPCKLMask(NumElems, MaskEltSize, Mask); | |||
| 7850 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); | |||
| 7851 | break; | |||
| 7852 | case X86ISD::MOVHLPS: | |||
| 7853 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7853, __extension__ __PRETTY_FUNCTION__)); | |||
| 7854 | assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(1).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7854, __extension__ __PRETTY_FUNCTION__)); | |||
| 7855 | DecodeMOVHLPSMask(NumElems, Mask); | |||
| 7856 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); | |||
| 7857 | break; | |||
| 7858 | case X86ISD::MOVLHPS: | |||
| 7859 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7859, __extension__ __PRETTY_FUNCTION__)); | |||
| 7860 | assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(1).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7860, __extension__ __PRETTY_FUNCTION__)); | |||
| 7861 | DecodeMOVLHPSMask(NumElems, Mask); | |||
| 7862 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); | |||
| 7863 | break; | |||
| 7864 | case X86ISD::VALIGN: | |||
| 7865 | assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&(static_cast <bool> ((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && "Only 32-bit and 64-bit elements are supported!" ) ? void (0) : __assert_fail ("(VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && \"Only 32-bit and 64-bit elements are supported!\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7866, __extension__ __PRETTY_FUNCTION__)) | |||
| 7866 | "Only 32-bit and 64-bit elements are supported!")(static_cast <bool> ((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && "Only 32-bit and 64-bit elements are supported!" ) ? void (0) : __assert_fail ("(VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && \"Only 32-bit and 64-bit elements are supported!\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7866, __extension__ __PRETTY_FUNCTION__)); | |||
| 7867 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7867, __extension__ __PRETTY_FUNCTION__)); | |||
| 7868 | assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(1).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7868, __extension__ __PRETTY_FUNCTION__)); | |||
| 7869 | ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); | |||
| 7870 | DecodeVALIGNMask(NumElems, ImmN, Mask); | |||
| 7871 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); | |||
| 7872 | Ops.push_back(N->getOperand(1)); | |||
| 7873 | Ops.push_back(N->getOperand(0)); | |||
| 7874 | break; | |||
| 7875 | case X86ISD::PALIGNR: | |||
| 7876 | assert(VT.getScalarType() == MVT::i8 && "Byte vector expected")(static_cast <bool> (VT.getScalarType() == MVT::i8 && "Byte vector expected") ? void (0) : __assert_fail ("VT.getScalarType() == MVT::i8 && \"Byte vector expected\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7876, __extension__ __PRETTY_FUNCTION__)); | |||
| 7877 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7877, __extension__ __PRETTY_FUNCTION__)); | |||
| 7878 | assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(1).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7878, __extension__ __PRETTY_FUNCTION__)); | |||
| 7879 | ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); | |||
| 7880 | DecodePALIGNRMask(NumElems, ImmN, Mask); | |||
| 7881 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); | |||
| 7882 | Ops.push_back(N->getOperand(1)); | |||
| 7883 | Ops.push_back(N->getOperand(0)); | |||
| 7884 | break; | |||
| 7885 | case X86ISD::VSHLDQ: | |||
| 7886 | assert(VT.getScalarType() == MVT::i8 && "Byte vector expected")(static_cast <bool> (VT.getScalarType() == MVT::i8 && "Byte vector expected") ? void (0) : __assert_fail ("VT.getScalarType() == MVT::i8 && \"Byte vector expected\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7886, __extension__ __PRETTY_FUNCTION__)); | |||
| 7887 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7887, __extension__ __PRETTY_FUNCTION__)); | |||
| 7888 | ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); | |||
| 7889 | DecodePSLLDQMask(NumElems, ImmN, Mask); | |||
| 7890 | IsUnary = true; | |||
| 7891 | break; | |||
| 7892 | case X86ISD::VSRLDQ: | |||
| 7893 | assert(VT.getScalarType() == MVT::i8 && "Byte vector expected")(static_cast <bool> (VT.getScalarType() == MVT::i8 && "Byte vector expected") ? void (0) : __assert_fail ("VT.getScalarType() == MVT::i8 && \"Byte vector expected\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7893, __extension__ __PRETTY_FUNCTION__)); | |||
| 7894 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7894, __extension__ __PRETTY_FUNCTION__)); | |||
| 7895 | ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); | |||
| 7896 | DecodePSRLDQMask(NumElems, ImmN, Mask); | |||
| 7897 | IsUnary = true; | |||
| 7898 | break; | |||
| 7899 | case X86ISD::PSHUFD: | |||
| 7900 | case X86ISD::VPERMILPI: | |||
| 7901 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7901, __extension__ __PRETTY_FUNCTION__)); | |||
| 7902 | ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); | |||
| 7903 | DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask); | |||
| 7904 | IsUnary = true; | |||
| 7905 | break; | |||
| 7906 | case X86ISD::PSHUFHW: | |||
| 7907 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7907, __extension__ __PRETTY_FUNCTION__)); | |||
| 7908 | ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); | |||
| 7909 | DecodePSHUFHWMask(NumElems, ImmN, Mask); | |||
| 7910 | IsUnary = true; | |||
| 7911 | break; | |||
| 7912 | case X86ISD::PSHUFLW: | |||
| 7913 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7913, __extension__ __PRETTY_FUNCTION__)); | |||
| 7914 | ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); | |||
| 7915 | DecodePSHUFLWMask(NumElems, ImmN, Mask); | |||
| 7916 | IsUnary = true; | |||
| 7917 | break; | |||
| 7918 | case X86ISD::VZEXT_MOVL: | |||
| 7919 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7919, __extension__ __PRETTY_FUNCTION__)); | |||
| 7920 | DecodeZeroMoveLowMask(NumElems, Mask); | |||
| 7921 | IsUnary = true; | |||
| 7922 | break; | |||
| 7923 | case X86ISD::VBROADCAST: | |||
| 7924 | // We only decode broadcasts of same-sized vectors, peeking through to | |||
| 7925 | // extracted subvectors is likely to cause hasOneUse issues with | |||
| 7926 | // SimplifyDemandedBits etc. | |||
| 7927 | if (N->getOperand(0).getValueType() == VT) { | |||
| 7928 | DecodeVectorBroadcast(NumElems, Mask); | |||
| 7929 | IsUnary = true; | |||
| 7930 | break; | |||
| 7931 | } | |||
| 7932 | return false; | |||
| 7933 | case X86ISD::VPERMILPV: { | |||
| 7934 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7934, __extension__ __PRETTY_FUNCTION__)); | |||
| 7935 | IsUnary = true; | |||
| 7936 | SDValue MaskNode = N->getOperand(1); | |||
| 7937 | if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, | |||
| 7938 | RawUndefs)) { | |||
| 7939 | DecodeVPERMILPMask(NumElems, MaskEltSize, RawMask, RawUndefs, Mask); | |||
| 7940 | break; | |||
| 7941 | } | |||
| 7942 | return false; | |||
| 7943 | } | |||
| 7944 | case X86ISD::PSHUFB: { | |||
| 7945 | assert(VT.getScalarType() == MVT::i8 && "Byte vector expected")(static_cast <bool> (VT.getScalarType() == MVT::i8 && "Byte vector expected") ? void (0) : __assert_fail ("VT.getScalarType() == MVT::i8 && \"Byte vector expected\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7945, __extension__ __PRETTY_FUNCTION__)); | |||
| 7946 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7946, __extension__ __PRETTY_FUNCTION__)); | |||
| 7947 | assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(1).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7947, __extension__ __PRETTY_FUNCTION__)); | |||
| 7948 | IsUnary = true; | |||
| 7949 | SDValue MaskNode = N->getOperand(1); | |||
| 7950 | if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) { | |||
| 7951 | DecodePSHUFBMask(RawMask, RawUndefs, Mask); | |||
| 7952 | break; | |||
| 7953 | } | |||
| 7954 | return false; | |||
| 7955 | } | |||
| 7956 | case X86ISD::VPERMI: | |||
| 7957 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7957, __extension__ __PRETTY_FUNCTION__)); | |||
| 7958 | ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); | |||
| 7959 | DecodeVPERMMask(NumElems, ImmN, Mask); | |||
| 7960 | IsUnary = true; | |||
| 7961 | break; | |||
| 7962 | case X86ISD::MOVSS: | |||
| 7963 | case X86ISD::MOVSD: | |||
| 7964 | case X86ISD::MOVSH: | |||
| 7965 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7965, __extension__ __PRETTY_FUNCTION__)); | |||
| 7966 | assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(1).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7966, __extension__ __PRETTY_FUNCTION__)); | |||
| 7967 | DecodeScalarMoveMask(NumElems, /* IsLoad */ false, Mask); | |||
| 7968 | break; | |||
| 7969 | case X86ISD::VPERM2X128: | |||
| 7970 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7970, __extension__ __PRETTY_FUNCTION__)); | |||
| 7971 | assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(1).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7971, __extension__ __PRETTY_FUNCTION__)); | |||
| 7972 | ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); | |||
| 7973 | DecodeVPERM2X128Mask(NumElems, ImmN, Mask); | |||
| 7974 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); | |||
| 7975 | break; | |||
| 7976 | case X86ISD::SHUF128: | |||
| 7977 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7977, __extension__ __PRETTY_FUNCTION__)); | |||
| 7978 | assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(1).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7978, __extension__ __PRETTY_FUNCTION__)); | |||
| 7979 | ImmN = N->getConstantOperandVal(N->getNumOperands() - 1); | |||
| 7980 | decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask); | |||
| 7981 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); | |||
| 7982 | break; | |||
| 7983 | case X86ISD::MOVSLDUP: | |||
| 7984 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7984, __extension__ __PRETTY_FUNCTION__)); | |||
| 7985 | DecodeMOVSLDUPMask(NumElems, Mask); | |||
| 7986 | IsUnary = true; | |||
| 7987 | break; | |||
| 7988 | case X86ISD::MOVSHDUP: | |||
| 7989 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7989, __extension__ __PRETTY_FUNCTION__)); | |||
| 7990 | DecodeMOVSHDUPMask(NumElems, Mask); | |||
| 7991 | IsUnary = true; | |||
| 7992 | break; | |||
| 7993 | case X86ISD::MOVDDUP: | |||
| 7994 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7994, __extension__ __PRETTY_FUNCTION__)); | |||
| 7995 | DecodeMOVDDUPMask(NumElems, Mask); | |||
| 7996 | IsUnary = true; | |||
| 7997 | break; | |||
| 7998 | case X86ISD::VPERMIL2: { | |||
| 7999 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 7999, __extension__ __PRETTY_FUNCTION__)); | |||
| 8000 | assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(1).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 8000, __extension__ __PRETTY_FUNCTION__)); | |||
| 8001 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); | |||
| 8002 | SDValue MaskNode = N->getOperand(2); | |||
| 8003 | SDValue CtrlNode = N->getOperand(3); | |||
| 8004 | if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) { | |||
| 8005 | unsigned CtrlImm = CtrlOp->getZExtValue(); | |||
| 8006 | if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, | |||
| 8007 | RawUndefs)) { | |||
| 8008 | DecodeVPERMIL2PMask(NumElems, MaskEltSize, CtrlImm, RawMask, RawUndefs, | |||
| 8009 | Mask); | |||
| 8010 | break; | |||
| 8011 | } | |||
| 8012 | } | |||
| 8013 | return false; | |||
| 8014 | } | |||
| 8015 | case X86ISD::VPPERM: { | |||
| 8016 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 8016, __extension__ __PRETTY_FUNCTION__)); | |||
| 8017 | assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(1).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 8017, __extension__ __PRETTY_FUNCTION__)); | |||
| 8018 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); | |||
| 8019 | SDValue MaskNode = N->getOperand(2); | |||
| 8020 | if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask, RawUndefs)) { | |||
| 8021 | DecodeVPPERMMask(RawMask, RawUndefs, Mask); | |||
| 8022 | break; | |||
| 8023 | } | |||
| 8024 | return false; | |||
| 8025 | } | |||
| 8026 | case X86ISD::VPERMV: { | |||
| 8027 | assert(N->getOperand(1).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(1).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(1).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 8027, __extension__ __PRETTY_FUNCTION__)); | |||
| 8028 | IsUnary = true; | |||
| 8029 | // Unlike most shuffle nodes, VPERMV's mask operand is operand 0. | |||
| 8030 | Ops.push_back(N->getOperand(1)); | |||
| 8031 | SDValue MaskNode = N->getOperand(0); | |||
| 8032 | if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, | |||
| 8033 | RawUndefs)) { | |||
| 8034 | DecodeVPERMVMask(RawMask, RawUndefs, Mask); | |||
| 8035 | break; | |||
| 8036 | } | |||
| 8037 | return false; | |||
| 8038 | } | |||
| 8039 | case X86ISD::VPERMV3: { | |||
| 8040 | assert(N->getOperand(0).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(0).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(0).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 8040, __extension__ __PRETTY_FUNCTION__)); | |||
| 8041 | assert(N->getOperand(2).getValueType() == VT && "Unexpected value type")(static_cast <bool> (N->getOperand(2).getValueType() == VT && "Unexpected value type") ? void (0) : __assert_fail ("N->getOperand(2).getValueType() == VT && \"Unexpected value type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 8041, __extension__ __PRETTY_FUNCTION__)); | |||
| 8042 | IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2); | |||
| 8043 | // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one. | |||
| 8044 | Ops.push_back(N->getOperand(0)); | |||
| 8045 | Ops.push_back(N->getOperand(2)); | |||
| 8046 | SDValue MaskNode = N->getOperand(1); | |||
| 8047 | if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask, | |||
| 8048 | RawUndefs)) { | |||
| 8049 | DecodeVPERMV3Mask(RawMask, RawUndefs, Mask); | |||
| 8050 | break; | |||
| 8051 | } | |||
| 8052 | return false; | |||
| 8053 | } | |||
| 8054 | default: llvm_unreachable("unknown target shuffle node")::llvm::llvm_unreachable_internal("unknown target shuffle node" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 8054); | |||
| 8055 | } | |||

  // Empty mask indicates the decode failed.
  if (Mask.empty())
    return false;

  // Check if we're getting a shuffle mask with zero'd elements.
  if (!AllowSentinelZero && isAnyZero(Mask))
    return false;

  // If we have a fake unary shuffle, the shuffle mask is spread across two
  // inputs that are actually the same node. Re-map the mask to always point
  // into the first input.
  if (IsFakeUnary)
    for (int &M : Mask)
      if (M >= (int)Mask.size())
        M -= Mask.size();
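
  // Illustrative sketch, not from the original source: with a 4-element mask
  // and both operands the same node, a decoded mask of <0, 5, 2, 7> uses the
  // duplicate second input for elements 1 and 3; the loop above folds it to
  // <0, 1, 2, 3> so callers only ever see indices into the first input.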

  // If we didn't already add operands in the opcode-specific code, default to
  // adding 1 or 2 operands starting at 0.
  if (Ops.empty()) {
    Ops.push_back(N->getOperand(0));
    if (!IsUnary || IsFakeUnary)
      Ops.push_back(N->getOperand(1));
  }

  return true;
}

// Wrapper for getTargetShuffleMask that discards the IsUnary result.
static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
                                 SmallVectorImpl<SDValue> &Ops,
                                 SmallVectorImpl<int> &Mask) {
  bool IsUnary;
  return getTargetShuffleMask(N, VT, AllowSentinelZero, Ops, Mask, IsUnary);
}

/// Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
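///
/// Illustrative example (values chosen for exposition, not from the source):
/// for shuffle(V1, V2) with Mask = <0, -1, 6, 3> where V2 is an all-zeros
/// BUILD_VECTOR, element 1 is undef in the mask (KnownUndef bit 1 is set)
/// and element 2 references the zero input (KnownZero bit 2 is set).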
static void computeZeroableShuffleElements(ArrayRef<int> Mask,
                                           SDValue V1, SDValue V2,
                                           APInt &KnownUndef, APInt &KnownZero) {
  int Size = Mask.size();
  KnownUndef = KnownZero = APInt::getZero(Size);

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

  int VectorSizeInBits = V1.getValueSizeInBits();
  int ScalarSizeInBits = VectorSizeInBits / Size;
  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];
    // Handle the easy cases.
    if (M < 0) {
      KnownUndef.setBit(i);
      continue;
    }
    if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
      KnownZero.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
    if (V.getOpcode() != ISD::BUILD_VECTOR)
      continue;

    // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
    // the (larger) source element must be UNDEF/ZERO.
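    // For instance (illustrative values only): a v2i64 BUILD_VECTOR feeding a
    // 4-element i32 mask gives Scale = 2, so mask element M maps to
    // build-vector operand M / 2 and the 32-bit slice at (M % 2) * 32 bits.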
    if ((Size % V.getNumOperands()) == 0) {
      int Scale = Size / V->getNumOperands();
      SDValue Op = V.getOperand(M / Scale);
      if (Op.isUndef())
        KnownUndef.setBit(i);
      if (X86::isZeroNode(Op))
        KnownZero.setBit(i);
      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
        APInt Val = Cst->getAPIntValue();
        Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
        if (Val == 0)
          KnownZero.setBit(i);
      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
        APInt Val = Cst->getValueAPF().bitcastToAPInt();
        Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
        if (Val == 0)
          KnownZero.setBit(i);
      }
      continue;
    }

    // If the BUILD_VECTOR has more elements, then all of the (smaller) source
    // elements must be UNDEF or ZERO.
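    // For instance (illustrative values only): a v16i8 BUILD_VECTOR feeding a
    // 4-element i32 mask gives Scale = 4, so mask element M is known zero
    // (or undef) only if all four byte operands M*4 .. M*4+3 are.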
    if ((V.getNumOperands() % Size) == 0) {
      int Scale = V->getNumOperands() / Size;
      bool AllUndef = true;
      bool AllZero = true;
      for (int j = 0; j < Scale; ++j) {
        SDValue Op = V.getOperand((M * Scale) + j);
        AllUndef &= Op.isUndef();
        AllZero &= X86::isZeroNode(Op);
      }
      if (AllUndef)
        KnownUndef.setBit(i);
      if (AllZero)
        KnownZero.setBit(i);
      continue;
    }
  }
}

/// Decode a target shuffle mask and inputs and see if any values are
/// known to be undef or zero from their inputs.
/// Returns true if the target shuffle mask was decoded.
/// FIXME: Merge this with computeZeroableShuffleElements?
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
                                         SmallVectorImpl<SDValue> &Ops,
                                         APInt &KnownUndef, APInt &KnownZero) {
  bool IsUnary;
  if (!isTargetShuffle(N.getOpcode()))
    return false;

  MVT VT = N.getSimpleValueType();
  if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
    return false;

  int Size = Mask.size();
  SDValue V1 = Ops[0];
  SDValue V2 = IsUnary ? V1 : Ops[1];
  KnownUndef = KnownZero = APInt::getZero(Size);

  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  assert((VT.getSizeInBits() % Size) == 0 &&
         "Illegal split of shuffle value type");
  unsigned EltSizeInBits = VT.getSizeInBits() / Size;

  // Extract known constant input data.
  APInt UndefSrcElts[2];
  SmallVector<APInt, 32> SrcEltBits[2];
  bool IsSrcConstant[2] = {
      getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
                                    SrcEltBits[0], true, false),
      getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
                                    SrcEltBits[1], true, false)};

  for (int i = 0; i < Size; ++i) {
    int M = Mask[i];

    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
    if (M < 0) {
      assert(isUndefOrZero(M) && "Unknown shuffle sentinel value!");
      if (SM_SentinelUndef == M)
        KnownUndef.setBit(i);
      if (SM_SentinelZero == M)
        KnownZero.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    unsigned SrcIdx = M / Size;
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // We are referencing an UNDEF input.
    if (V.isUndef()) {
      KnownUndef.setBit(i);
      continue;
    }

    // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
    // TODO: We currently only set UNDEF for integer types - floats use the same
    // registers as vectors and many of the scalar folded loads rely on the
    // SCALAR_TO_VECTOR pattern.
    if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
        (Size % V.getValueType().getVectorNumElements()) == 0) {
      int Scale = Size / V.getValueType().getVectorNumElements();
      int Idx = M / Scale;
      if (Idx != 0 && !VT.isFloatingPoint())
        KnownUndef.setBit(i);
      else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
        KnownZero.setBit(i);
      continue;
    }

    // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
    // base vectors.
    if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
      SDValue Vec = V.getOperand(0);
      int NumVecElts = Vec.getValueType().getVectorNumElements();
      if (Vec.isUndef() && Size == NumVecElts) {
        int Idx = V.getConstantOperandVal(2);
        int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
        if (M < Idx || (Idx + NumSubElts) <= M)
          KnownUndef.setBit(i);
      }
      continue;
    }

    // Attempt to extract from the source's constant bits.
    if (IsSrcConstant[SrcIdx]) {
      if (UndefSrcElts[SrcIdx][M])
        KnownUndef.setBit(i);
      else if (SrcEltBits[SrcIdx][M] == 0)
        KnownZero.setBit(i);
    }
  }

  assert(VT.getVectorNumElements() == (unsigned)Size &&
         "Different mask size from vector size!");
  return true;
}

// Replace target shuffle mask elements with known undef/zero sentinels.
static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
                                              const APInt &KnownUndef,
                                              const APInt &KnownZero,
                                              bool ResolveKnownZeros = true) {
  unsigned NumElts = Mask.size();
  assert(KnownUndef.getBitWidth() == NumElts &&
         KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");

  for (unsigned i = 0; i != NumElts; ++i) {
    if (KnownUndef[i])
      Mask[i] = SM_SentinelUndef;
    else if (ResolveKnownZeros && KnownZero[i])
      Mask[i] = SM_SentinelZero;
  }
}

// Extract target shuffle mask sentinel elements to known undef/zero bitmasks.
static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
                                              APInt &KnownUndef,
                                              APInt &KnownZero) {
  unsigned NumElts = Mask.size();
  KnownUndef = KnownZero = APInt::getZero(NumElts);

  for (unsigned i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (SM_SentinelUndef == M)
      KnownUndef.setBit(i);
    if (SM_SentinelZero == M)
      KnownZero.setBit(i);
  }
}
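
// A minimal usage sketch (illustrative, not from the source): for
// Mask = <SM_SentinelUndef, 0, SM_SentinelZero, 1>,
// resolveZeroablesFromTargetShuffle() produces KnownUndef = 0b0001 and
// KnownZero = 0b0100 (bit i describes element i); feeding those back into
// resolveTargetShuffleFromZeroables() reconstructs the same sentinel mask.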

// Attempt to create a shuffle mask from a VSELECT/BLENDV condition mask.
static bool createShuffleMaskFromVSELECT(SmallVectorImpl<int> &Mask,
                                         SDValue Cond, bool IsBLENDV = false) {
  EVT CondVT = Cond.getValueType();
  unsigned EltSizeInBits = CondVT.getScalarSizeInBits();
  unsigned NumElts = CondVT.getVectorNumElements();

  APInt UndefElts;
  SmallVector<APInt, 32> EltBits;
  if (!getTargetConstantBitsFromNode(Cond, EltSizeInBits, UndefElts, EltBits,
                                     true, false))
    return false;

  Mask.resize(NumElts, SM_SentinelUndef);

  for (int i = 0; i != (int)NumElts; ++i) {
    Mask[i] = i;
    // Arbitrarily choose from the 2nd operand if the select condition element
    // is undef.
    // TODO: Can we do better by matching patterns such as even/odd?
    if (UndefElts[i] || (!IsBLENDV && EltBits[i].isZero()) ||
        (IsBLENDV && EltBits[i].isNonNegative()))
      Mask[i] += NumElts;
  }

  return true;
}
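
// Illustrative example (condition constant assumed for exposition): for a
// 4-element VSELECT whose condition is <-1, 0, -1, undef>, the loop above
// yields Mask = <0, 5, 2, 7>: true (nonzero) condition elements select from
// the first value operand (indices 0..3), zero or undef elements select from
// the second (indices 4..7).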

// Forward declaration (for getFauxShuffleMask recursive check).
static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
                                   SmallVectorImpl<SDValue> &Inputs,
                                   SmallVectorImpl<int> &Mask,
                                   const SelectionDAG &DAG, unsigned Depth,
                                   bool ResolveKnownElts);

// Attempt to decode ops that could be represented as a shuffle mask.
// The decoded shuffle mask may contain a different number of elements than
// the destination value type.
// TODO: Merge into getTargetShuffleInputs()
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
                               SmallVectorImpl<int> &Mask,
                               SmallVectorImpl<SDValue> &Ops,
                               const SelectionDAG &DAG, unsigned Depth,
                               bool ResolveKnownElts) {
  Mask.clear();
  Ops.clear();

  MVT VT = N.getSimpleValueType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned NumSizeInBits = VT.getSizeInBits();
  unsigned NumBitsPerElt = VT.getScalarSizeInBits();
  if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
    return false;
  assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
  unsigned NumSizeInBytes = NumSizeInBits / 8;
  unsigned NumBytesPerElt = NumBitsPerElt / 8;

  unsigned Opcode = N.getOpcode();
  switch (Opcode) {
  case ISD::VECTOR_SHUFFLE: {
    // Don't treat ISD::VECTOR_SHUFFLE as a target shuffle so decode it here.
    ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(N)->getMask();
    if (isUndefOrInRange(ShuffleMask, 0, 2 * NumElts)) {
      Mask.append(ShuffleMask.begin(), ShuffleMask.end());
      Ops.push_back(N.getOperand(0));
      Ops.push_back(N.getOperand(1));
      return true;
    }
    return false;
  }
  case ISD::AND:
  case X86ISD::ANDNP: {
    // Attempt to decode as a per-byte mask.
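    // For instance (illustrative constant): an AND with a v16i8 constant that
    // alternates 0xFF and 0x00 decodes to the byte mask <0, Z, 2, Z, ...>
    // (Z = SM_SentinelZero), keeping even bytes and zeroing odd bytes.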
    APInt UndefElts;
    SmallVector<APInt, 32> EltBits;
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    bool IsAndN = (X86ISD::ANDNP == Opcode);
    uint64_t ZeroMask = IsAndN ? 255 : 0;
    if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
      return false;
    // We can't assume an undef src element gives an undef dst - the other src
    // might be zero.
    if (!UndefElts.isZero())
      return false;
    for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
      const APInt &ByteBits = EltBits[i];
      if (ByteBits != 0 && ByteBits != 255)
        return false;
      Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
    }
    Ops.push_back(IsAndN ? N1 : N0);
    return true;
  }
  case ISD::OR: {
    // Handle OR(SHUFFLE,SHUFFLE) case where one source is zero and the other
    // is a valid shuffle index.
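    // Illustrative example: if N1's decoded mask shows element 0 as zero and
    // N0's shows element 1 as zero, OR(N0, N1) acts as a blend - element 0
    // comes from N0 and element 1 from N1, i.e. Mask = <0, 1 + MaskSize>.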
    SDValue N0 = peekThroughBitcasts(N.getOperand(0));
    SDValue N1 = peekThroughBitcasts(N.getOperand(1));
    if (!N0.getValueType().isVector() || !N1.getValueType().isVector())
      return false;

    SmallVector<int, 64> SrcMask0, SrcMask1;
    SmallVector<SDValue, 2> SrcInputs0, SrcInputs1;
    APInt Demand0 = APInt::getAllOnes(N0.getValueType().getVectorNumElements());
    APInt Demand1 = APInt::getAllOnes(N1.getValueType().getVectorNumElements());
    if (!getTargetShuffleInputs(N0, Demand0, SrcInputs0, SrcMask0, DAG,
                                Depth + 1, true) ||
        !getTargetShuffleInputs(N1, Demand1, SrcInputs1, SrcMask1, DAG,
                                Depth + 1, true))
      return false;

    size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
    SmallVector<int, 64> Mask0, Mask1;
    narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
    narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
    for (int i = 0; i != (int)MaskSize; ++i) {
      // NOTE: Don't handle SM_SentinelUndef, as we can end up in infinite
      // loops converting between OR and BLEND shuffles due to
      // canWidenShuffleElements merging away undef elements, meaning we
      // fail to recognise the OR as the undef element isn't known zero.
      if (Mask0[i] == SM_SentinelZero && Mask1[i] == SM_SentinelZero)
        Mask.push_back(SM_SentinelZero);
      else if (Mask1[i] == SM_SentinelZero)
        Mask.push_back(i);
      else if (Mask0[i] == SM_SentinelZero)
        Mask.push_back(i + MaskSize);
      else
        return false;
    }
    Ops.push_back(N0);
    Ops.push_back(N1);
    return true;
  }
  case ISD::INSERT_SUBVECTOR: {
    SDValue Src = N.getOperand(0);
    SDValue Sub = N.getOperand(1);
    EVT SubVT = Sub.getValueType();
    unsigned NumSubElts = SubVT.getVectorNumElements();
    if (!N->isOnlyUserOf(Sub.getNode()))
      return false;
    uint64_t InsertIdx = N.getConstantOperandVal(2);
    // Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
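    // Illustrative example: inserting the high half of a v4i64 SRC1
    // (ExtractIdx = 2) at InsertIdx = 0 of a v4i64 SRC0 decodes to
    // Mask = <6, 7, 2, 3> over the operands {SRC0, SRC1}.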
    if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        Sub.getOperand(0).getValueType() == VT) {
      uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
      for (int i = 0; i != (int)NumElts; ++i)
        Mask.push_back(i);
      for (int i = 0; i != (int)NumSubElts; ++i)
        Mask[InsertIdx + i] = NumElts + ExtractIdx + i;
      Ops.push_back(Src);
      Ops.push_back(Sub.getOperand(0));
      return true;
    }
    // Handle INSERT_SUBVECTOR(SRC0, SHUFFLE(SRC1)).
    SmallVector<int, 64> SubMask;
    SmallVector<SDValue, 2> SubInputs;
    SDValue SubSrc = peekThroughOneUseBitcasts(Sub);
    EVT SubSrcVT = SubSrc.getValueType();
    if (!SubSrcVT.isVector())
      return false;

    APInt SubDemand = APInt::getAllOnes(SubSrcVT.getVectorNumElements());
    if (!getTargetShuffleInputs(SubSrc, SubDemand, SubInputs, SubMask, DAG,
                                Depth + 1, ResolveKnownElts))
      return false;

    // Subvector shuffle inputs must not be larger than the subvector.
    if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
          return SubVT.getFixedSizeInBits() <
                 SubInput.getValueSizeInBits().getFixedValue();
        }))
      return false;

    if (SubMask.size() != NumSubElts) {
      assert(((SubMask.size() % NumSubElts) == 0 ||
              (NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
      if ((NumSubElts % SubMask.size()) == 0) {
        int Scale = NumSubElts / SubMask.size();
        SmallVector<int, 64> ScaledSubMask;
        narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
        SubMask = ScaledSubMask;
      } else {
        int Scale = SubMask.size() / NumSubElts;
        NumSubElts = SubMask.size();
        NumElts *= Scale;
        InsertIdx *= Scale;
      }
    }
    Ops.push_back(Src);
    Ops.append(SubInputs.begin(), SubInputs.end());
    if (ISD::isBuildVectorAllZeros(Src.getNode()))
      Mask.append(NumElts, SM_SentinelZero);
    else
      for (int i = 0; i != (int)NumElts; ++i)
        Mask.push_back(i);
    for (int i = 0; i != (int)NumSubElts; ++i) {
      int M = SubMask[i];
      if (0 <= M) {
        int InputIdx = M / NumSubElts;
        M = (NumElts * (1 + InputIdx)) + (M % NumSubElts);
      }
      Mask[i + InsertIdx] = M;
    }
    return true;
  }
  case X86ISD::PINSRB:
  case X86ISD::PINSRW:
  case ISD::SCALAR_TO_VECTOR:
  case ISD::INSERT_VECTOR_ELT: {
    // Match against an insert_vector_elt/scalar_to_vector of an extract from a
    // vector, for matching src/dst vector types.
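    // Illustrative example: for a v16i8 PINSRB(Dst, PEXTRB(Src, 3), 5), the
    // byte-level mask built below is the identity over Dst (indices 16..31,
    // since Src is listed first in Ops) except byte 5, which becomes Src
    // byte 3.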
    SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);

    unsigned DstIdx = 0;
    if (Opcode != ISD::SCALAR_TO_VECTOR) {
      // Check we have an in-range constant insertion index.
      if (!isa<ConstantSDNode>(N.getOperand(2)) ||
          N.getConstantOperandAPInt(2).uge(NumElts))
        return false;
      DstIdx = N.getConstantOperandVal(2);

      // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
      if (X86::isZeroNode(Scl)) {
        Ops.push_back(N.getOperand(0));
        for (unsigned i = 0; i != NumElts; ++i)
          Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
        return true;
      }
    }

    // Peek through trunc/aext/zext.
    // TODO: aext shouldn't require SM_SentinelZero padding.
    // TODO: handle shift of scalars.
    unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
    while (Scl.getOpcode() == ISD::TRUNCATE ||
           Scl.getOpcode() == ISD::ANY_EXTEND ||
           Scl.getOpcode() == ISD::ZERO_EXTEND) {
      Scl = Scl.getOperand(0);
      MinBitsPerElt =
          std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
    }
    if ((MinBitsPerElt % 8) != 0)
      return false;

    // Attempt to find the source vector the scalar was extracted from.
    SDValue SrcExtract;
    if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
         Scl.getOpcode() == X86ISD::PEXTRW ||
         Scl.getOpcode() == X86ISD::PEXTRB) &&
        Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
      SrcExtract = Scl;
    }
    if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
      return false;

    SDValue SrcVec = SrcExtract.getOperand(0);
    EVT SrcVT = SrcVec.getValueType();
    if (!SrcVT.getScalarType().isByteSized())
      return false;
    unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
    unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
    unsigned DstByte = DstIdx * NumBytesPerElt;
    MinBitsPerElt =
        std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());

    // Create 'identity' byte level shuffle mask and then add inserted bytes.
    if (Opcode == ISD::SCALAR_TO_VECTOR) {
      Ops.push_back(SrcVec);
      Mask.append(NumSizeInBytes, SM_SentinelUndef);
    } else {
      Ops.push_back(SrcVec);
      Ops.push_back(N.getOperand(0));
      for (int i = 0; i != (int)NumSizeInBytes; ++i)
        Mask.push_back(NumSizeInBytes + i);
    }

    unsigned MinBytesPerElts = MinBitsPerElt / 8;
    MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
    for (unsigned i = 0; i != MinBytesPerElts; ++i)
      Mask[DstByte + i] = SrcByte + i;
    for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
      Mask[DstByte + i] = SM_SentinelZero;
    return true;
  }
  case X86ISD::PACKSS:
  case X86ISD::PACKUS: {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
           N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
           "Unexpected input value type");

    APInt EltsLHS, EltsRHS;
    getPackDemandedElts(VT, DemandedElts, EltsLHS, EltsRHS);

    // If we know input saturation won't happen (or we don't care about
    // particular lanes), we can treat this as a truncation shuffle.
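    // Illustrative example: a 128-bit v8i16 PACKUS of two v4i32 inputs whose
    // upper halves are known zero decodes to the truncation shuffle
    // <0, 2, 4, 6, 8, 10, 12, 14> over the inputs viewed as v8i16 each.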
| 8613 | bool Offset0 = false, Offset1 = false; | |||
| 8614 | if (Opcode == X86ISD::PACKSS) { | |||
| 8615 | if ((!(N0.isUndef() || EltsLHS.isZero()) && | |||
| 8616 | DAG.ComputeNumSignBits(N0, EltsLHS, Depth + 1) <= NumBitsPerElt) || | |||
| 8617 | (!(N1.isUndef() || EltsRHS.isZero()) && | |||
| 8618 | DAG.ComputeNumSignBits(N1, EltsRHS, Depth + 1) <= NumBitsPerElt)) | |||
| 8619 | return false; | |||
| 8620 | // We can't easily fold ASHR into a shuffle, but if it was feeding a | |||
| 8621 | // PACKSS then it was likely being used for sign-extension for a | |||
| 8622 | // truncation, so just peek through and adjust the mask accordingly. | |||
| 8623 | if (N0.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N0.getNode()) && | |||
| 8624 | N0.getConstantOperandAPInt(1) == NumBitsPerElt) { | |||
| 8625 | Offset0 = true; | |||
| 8626 | N0 = N0.getOperand(0); | |||
| 8627 | } | |||
| 8628 | if (N1.getOpcode() == X86ISD::VSRAI && N->isOnlyUserOf(N1.getNode()) && | |||
| 8629 | N1.getConstantOperandAPInt(1) == NumBitsPerElt) { | |||
| 8630 | Offset1 = true; | |||
| 8631 | N1 = N1.getOperand(0); | |||
| 8632 | } | |||
| 8633 | } else { | |||
| 8634 | APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt); | |||
| 8635 | if ((!(N0.isUndef() || EltsLHS.isZero()) && | |||
| 8636 | !DAG.MaskedValueIsZero(N0, ZeroMask, EltsLHS, Depth + 1)) || | |||
| 8637 | (!(N1.isUndef() || EltsRHS.isZero()) && | |||
| 8638 | !DAG.MaskedValueIsZero(N1, ZeroMask, EltsRHS, Depth + 1))) | |||
| 8639 | return false; | |||
| 8640 | } | |||
| 8641 | ||||
| 8642 | bool IsUnary = (N0 == N1); | |||
| 8643 | ||||
| 8644 | Ops.push_back(N0); | |||
| 8645 | if (!IsUnary) | |||
| 8646 | Ops.push_back(N1); | |||
| 8647 | ||||
| 8648 | createPackShuffleMask(VT, Mask, IsUnary); | |||
| 8649 | ||||
| 8650 | if (Offset0 || Offset1) { | |||
| 8651 | for (int &M : Mask) | |||
| 8652 | if ((Offset0 && isInRange(M, 0, NumElts)) || | |||
| 8653 | (Offset1 && isInRange(M, NumElts, 2 * NumElts))) | |||
| 8654 | ++M; | |||
| 8655 | } | |||
| 8656 | return true; | |||
| 8657 | } | |||
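| | // Illustrative sketch (values assumed for the example, not in the source): | |||
| | // for VT = v16i8 with N0/N1 : v8i16, a PACKUS whose inputs have known-zero | |||
| | // upper bytes decodes to the byte-truncation shuffle | |||
| | //   Mask = {0,2,4,6,8,10,12,14, 16,18,20,22,24,26,28,30} | |||
| | // and if an input was a VSRAI by NumBitsPerElt (8), the Offset fixup bumps | |||
| | // its mask entries by one so the shuffle reads the high byte of each word. | |||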
| 8658 | case ISD::VSELECT: | |||
| 8659 | case X86ISD::BLENDV: { | |||
| 8660 | SDValue Cond = N.getOperand(0); | |||
| 8661 | if (createShuffleMaskFromVSELECT(Mask, Cond, Opcode == X86ISD::BLENDV)) { | |||
| 8662 | Ops.push_back(N.getOperand(1)); | |||
| 8663 | Ops.push_back(N.getOperand(2)); | |||
| 8664 | return true; | |||
| 8665 | } | |||
| 8666 | return false; | |||
| 8667 | } | |||
| 8668 | case X86ISD::VTRUNC: { | |||
| 8669 | SDValue Src = N.getOperand(0); | |||
| 8670 | EVT SrcVT = Src.getValueType(); | |||
| 8671 | // Truncated source must be a simple vector. | |||
| 8672 | if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 || | |||
| 8673 | (SrcVT.getScalarSizeInBits() % 8) != 0) | |||
| 8674 | return false; | |||
| 8675 | unsigned NumSrcElts = SrcVT.getVectorNumElements(); | |||
| 8676 | unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits(); | |||
| 8677 | unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt; | |||
| 8678 | assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation"); | |||
| 8679 | for (unsigned i = 0; i != NumSrcElts; ++i) | |||
| 8680 | Mask.push_back(i * Scale); | |||
| 8681 | Mask.append(NumElts - NumSrcElts, SM_SentinelZero); | |||
| 8682 | Ops.push_back(Src); | |||
| 8683 | return true; | |||
| 8684 | } | |||
| 8685 | case X86ISD::VSHLI: | |||
| 8686 | case X86ISD::VSRLI: { | |||
| 8687 | uint64_t ShiftVal = N.getConstantOperandVal(1); | |||
| 8688 | // Out of range bit shifts are guaranteed to be zero. | |||
| 8689 | if (NumBitsPerElt <= ShiftVal) { | |||
| 8690 | Mask.append(NumElts, SM_SentinelZero); | |||
| 8691 | return true; | |||
| 8692 | } | |||
| 8693 | ||||
| 8694 | // We can only decode 'whole byte' bit shifts as shuffles. | |||
| 8695 | if ((ShiftVal % 8) != 0) | |||
| 8696 | break; | |||
| 8697 | ||||
| 8698 | uint64_t ByteShift = ShiftVal / 8; | |||
| 8699 | Ops.push_back(N.getOperand(0)); | |||
| 8700 | ||||
| 8701 | // Clear mask to all zeros and insert the shifted byte indices. | |||
| 8702 | Mask.append(NumSizeInBytes, SM_SentinelZero); | |||
| 8703 | ||||
| 8704 | if (X86ISD::VSHLI == Opcode) { | |||
| 8705 | for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) | |||
| 8706 | for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) | |||
| 8707 | Mask[i + j] = i + j - ByteShift; | |||
| 8708 | } else { | |||
| 8709 | for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt) | |||
| 8710 | for (unsigned j = ByteShift; j != NumBytesPerElt; ++j) | |||
| 8711 | Mask[i + j - ByteShift] = i + j; | |||
| 8712 | } | |||
| 8713 | return true; | |||
| 8714 | } | |||
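| | // Worked example (illustrative, little-endian byte numbering): for a v2i64 | |||
| | // VSHLI by 8 bits, ByteShift = 1 and each 8-byte lane decodes to | |||
| | //   Mask = {Z,0,1,2,3,4,5,6, Z,8,9,10,11,12,13,14}   (Z = SM_SentinelZero) | |||
| | // i.e. every byte moves up one slot and the vacated low byte is zeroed. | |||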
| 8715 | case X86ISD::VROTLI: | |||
| 8716 | case X86ISD::VROTRI: { | |||
| 8717 | // We can only decode 'whole byte' bit rotates as shuffles. | |||
| 8718 | uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt); | |||
| 8719 | if ((RotateVal % 8) != 0) | |||
| 8720 | return false; | |||
| 8721 | Ops.push_back(N.getOperand(0)); | |||
| 8722 | int Offset = RotateVal / 8; | |||
| 8723 | Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset); | |||
| 8724 | for (int i = 0; i != (int)NumElts; ++i) { | |||
| 8725 | int BaseIdx = i * NumBytesPerElt; | |||
| 8726 | for (int j = 0; j != (int)NumBytesPerElt; ++j) { | |||
| 8727 | Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt)); | |||
| 8728 | } | |||
| 8729 | } | |||
| 8730 | return true; | |||
| 8731 | } | |||
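| | // Worked example (illustrative): for v4i32 VROTLI by 8, Offset becomes | |||
| | // 4 - 1 = 3 and each 4-byte lane uses the byte pattern {3,0,1,2}, since | |||
| | // rotl32(x, 8) makes byte 3 of x the new low byte on little-endian. | |||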
| 8732 | case X86ISD::VBROADCAST: { | |||
| 8733 | SDValue Src = N.getOperand(0); | |||
| 8734 | if (!Src.getSimpleValueType().isVector()) { | |||
| 8735 | if (Src.getOpcode() != ISD::EXTRACT_VECTOR_ELT || | |||
| 8736 | !isNullConstant(Src.getOperand(1)) || | |||
| 8737 | Src.getOperand(0).getValueType().getScalarType() != | |||
| 8738 | VT.getScalarType()) | |||
| 8739 | return false; | |||
| 8740 | Src = Src.getOperand(0); | |||
| 8741 | } | |||
| 8742 | Ops.push_back(Src); | |||
| 8743 | Mask.append(NumElts, 0); | |||
| 8744 | return true; | |||
| 8745 | } | |||
| 8746 | case ISD::ZERO_EXTEND: | |||
| 8747 | case ISD::ANY_EXTEND: | |||
| 8748 | case ISD::ZERO_EXTEND_VECTOR_INREG: | |||
| 8749 | case ISD::ANY_EXTEND_VECTOR_INREG: { | |||
| 8750 | SDValue Src = N.getOperand(0); | |||
| 8751 | EVT SrcVT = Src.getValueType(); | |||
| 8752 | ||||
| 8753 | // Extended source must be a simple vector. | |||
| 8754 | if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 || | |||
| 8755 | (SrcVT.getScalarSizeInBits() % 8) != 0) | |||
| 8756 | return false; | |||
| 8757 | ||||
| 8758 | bool IsAnyExtend = | |||
| 8759 | (ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode); | |||
| 8760 | DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts, | |||
| 8761 | IsAnyExtend, Mask); | |||
| 8762 | Ops.push_back(Src); | |||
| 8763 | return true; | |||
| 8764 | } | |||
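| | // Illustrative sketch: for ZERO_EXTEND_VECTOR_INREG v16i8 -> v8i16, | |||
| | // DecodeZeroExtendMask emits, in source (byte) elements, | |||
| | //   Mask = {0,Z, 1,Z, 2,Z, 3,Z, 4,Z, 5,Z, 6,Z, 7,Z} | |||
| | // with Z = SM_SentinelZero (SM_SentinelUndef for the any-extend cases). | |||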
| 8765 | } | |||
| 8766 | ||||
| 8767 | return false; | |||
| 8768 | } | |||
| 8769 | ||||
| 8770 | /// Removes unused/repeated shuffle source inputs and adjusts the shuffle mask. | |||
| 8771 | static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs, | |||
| 8772 | SmallVectorImpl<int> &Mask) { | |||
| 8773 | int MaskWidth = Mask.size(); | |||
| 8774 | SmallVector<SDValue, 16> UsedInputs; | |||
| 8775 | for (int i = 0, e = Inputs.size(); i < e; ++i) { | |||
| 8776 | int lo = UsedInputs.size() * MaskWidth; | |||
| 8777 | int hi = lo + MaskWidth; | |||
| 8778 | ||||
| 8779 | // Strip UNDEF input usage. | |||
| 8780 | if (Inputs[i].isUndef()) | |||
| 8781 | for (int &M : Mask) | |||
| 8782 | if ((lo <= M) && (M < hi)) | |||
| 8783 | M = SM_SentinelUndef; | |||
| 8784 | ||||
| 8785 | // Check for unused inputs. | |||
| 8786 | if (none_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { | |||
| 8787 | for (int &M : Mask) | |||
| 8788 | if (lo <= M) | |||
| 8789 | M -= MaskWidth; | |||
| 8790 | continue; | |||
| 8791 | } | |||
| 8792 | ||||
| 8793 | // Check for repeated inputs. | |||
| 8794 | bool IsRepeat = false; | |||
| 8795 | for (int j = 0, ue = UsedInputs.size(); j != ue; ++j) { | |||
| 8796 | if (UsedInputs[j] != Inputs[i]) | |||
| 8797 | continue; | |||
| 8798 | for (int &M : Mask) | |||
| 8799 | if (lo <= M) | |||
| 8800 | M = (M < hi) ? ((M - lo) + (j * MaskWidth)) : (M - MaskWidth); | |||
| 8801 | IsRepeat = true; | |||
| 8802 | break; | |||
| 8803 | } | |||
| 8804 | if (IsRepeat) | |||
| 8805 | continue; | |||
| 8806 | ||||
| 8807 | UsedInputs.push_back(Inputs[i]); | |||
| 8808 | } | |||
| 8809 | Inputs = UsedInputs; | |||
| 8810 | } | |||
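| | // Worked example (illustrative values): with Inputs = [A, B, A] and | |||
| | // MaskWidth = 4, Mask = {8,1,9,3} never references B, so B is dropped | |||
| | // (entries >= 4 shift down by 4), and the repeated A is merged into the | |||
| | // first occurrence, leaving Inputs = [A] and Mask = {0,1,1,3}. | |||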
| 8811 | ||||
| 8812 | /// Calls getTargetShuffleAndZeroables to resolve a target shuffle mask's inputs | |||
| 8813 | /// and then sets the SM_SentinelUndef and SM_SentinelZero values. | |||
| 8814 | /// Returns true if the target shuffle mask was decoded. | |||
| 8815 | static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, | |||
| 8816 | SmallVectorImpl<SDValue> &Inputs, | |||
| 8817 | SmallVectorImpl<int> &Mask, | |||
| 8818 | APInt &KnownUndef, APInt &KnownZero, | |||
| 8819 | const SelectionDAG &DAG, unsigned Depth, | |||
| 8820 | bool ResolveKnownElts) { | |||
| 8821 | if (Depth >= SelectionDAG::MaxRecursionDepth) | |||
| 8822 | return false; // Limit search depth. | |||
| 8823 | ||||
| 8824 | EVT VT = Op.getValueType(); | |||
| 8825 | if (!VT.isSimple() || !VT.isVector()) | |||
| 8826 | return false; | |||
| 8827 | ||||
| 8828 | if (getTargetShuffleAndZeroables(Op, Mask, Inputs, KnownUndef, KnownZero)) { | |||
| 8829 | if (ResolveKnownElts) | |||
| 8830 | resolveTargetShuffleFromZeroables(Mask, KnownUndef, KnownZero); | |||
| 8831 | return true; | |||
| 8832 | } | |||
| 8833 | if (getFauxShuffleMask(Op, DemandedElts, Mask, Inputs, DAG, Depth, | |||
| 8834 | ResolveKnownElts)) { | |||
| 8835 | resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero); | |||
| 8836 | return true; | |||
| 8837 | } | |||
| 8838 | return false; | |||
| 8839 | } | |||
| 8840 | ||||
| 8841 | static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts, | |||
| 8842 | SmallVectorImpl<SDValue> &Inputs, | |||
| 8843 | SmallVectorImpl<int> &Mask, | |||
| 8844 | const SelectionDAG &DAG, unsigned Depth, | |||
| 8845 | bool ResolveKnownElts) { | |||
| 8846 | APInt KnownUndef, KnownZero; | |||
| 8847 | return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, KnownUndef, | |||
| 8848 | KnownZero, DAG, Depth, ResolveKnownElts); | |||
| 8849 | } | |||
| 8850 | ||||
| 8851 | static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs, | |||
| 8852 | SmallVectorImpl<int> &Mask, | |||
| 8853 | const SelectionDAG &DAG, unsigned Depth = 0, | |||
| 8854 | bool ResolveKnownElts = true) { | |||
| 8855 | EVT VT = Op.getValueType(); | |||
| 8856 | if (!VT.isSimple() || !VT.isVector()) | |||
| 8857 | return false; | |||
| 8858 | ||||
| 8859 | unsigned NumElts = Op.getValueType().getVectorNumElements(); | |||
| 8860 | APInt DemandedElts = APInt::getAllOnes(NumElts); | |||
| 8861 | return getTargetShuffleInputs(Op, DemandedElts, Inputs, Mask, DAG, Depth, | |||
| 8862 | ResolveKnownElts); | |||
| 8863 | } | |||
| 8864 | ||||
| 8865 | // Attempt to create a scalar/subvector broadcast from the base MemSDNode. | |||
| 8866 | static SDValue getBROADCAST_LOAD(unsigned Opcode, const SDLoc &DL, EVT VT, | |||
| 8867 | EVT MemVT, MemSDNode *Mem, unsigned Offset, | |||
| 8868 | SelectionDAG &DAG) { | |||
| 8869 | assert((Opcode == X86ISD::VBROADCAST_LOAD || | |||
| 8870 | Opcode == X86ISD::SUBV_BROADCAST_LOAD) && | |||
| 8871 | "Unknown broadcast load type"); | |||
| 8872 | ||||
| 8873 | // Ensure this is a simple (non-atomic, non-volatile), temporal read memop. | |||
| 8874 | if (!Mem || !Mem->readMem() || !Mem->isSimple() || Mem->isNonTemporal()) | |||
| 8875 | return SDValue(); | |||
| 8876 | ||||
| 8877 | SDValue Ptr = | |||
| 8878 | DAG.getMemBasePlusOffset(Mem->getBasePtr(), TypeSize::Fixed(Offset), DL); | |||
| 8879 | SDVTList Tys = DAG.getVTList(VT, MVT::Other); | |||
| 8880 | SDValue Ops[] = {Mem->getChain(), Ptr}; | |||
| 8881 | SDValue BcstLd = DAG.getMemIntrinsicNode( | |||
| 8882 | Opcode, DL, Tys, Ops, MemVT, | |||
| 8883 | DAG.getMachineFunction().getMachineMemOperand( | |||
| 8884 | Mem->getMemOperand(), Offset, MemVT.getStoreSize())); | |||
| 8885 | DAG.makeEquivalentMemoryOrdering(SDValue(Mem, 1), BcstLd.getValue(1)); | |||
| 8886 | return BcstLd; | |||
| 8887 | } | |||
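| | // Usage sketch (hypothetical operands): splatting the f32 at byte offset 4 | |||
| | // of an existing load Ld into v8f32 would be | |||
| | //   getBROADCAST_LOAD(X86ISD::VBROADCAST_LOAD, DL, MVT::v8f32, MVT::f32, | |||
| | //                     Ld, /*Offset=*/4, DAG); | |||
| | // which folds the offset into the memory operand and keeps later users of | |||
| | // Ld's chain ordered via makeEquivalentMemoryOrdering. | |||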
| 8888 | ||||
| 8889 | /// Returns the scalar element that will make up the Index'th | |||
| 8890 | /// element of the result of the vector shuffle. | |||
| 8891 | static SDValue getShuffleScalarElt(SDValue Op, unsigned Index, | |||
| 8892 | SelectionDAG &DAG, unsigned Depth) { | |||
| 8893 | if (Depth >= SelectionDAG::MaxRecursionDepth) | |||
| 8894 | return SDValue(); // Limit search depth. | |||
| 8895 | ||||
| 8896 | EVT VT = Op.getValueType(); | |||
| 8897 | unsigned Opcode = Op.getOpcode(); | |||
| 8898 | unsigned NumElems = VT.getVectorNumElements(); | |||
| 8899 | ||||
| 8900 | // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. | |||
| 8901 | if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) { | |||
| 8902 | int Elt = SV->getMaskElt(Index); | |||
| 8903 | ||||
| 8904 | if (Elt < 0) | |||
| 8905 | return DAG.getUNDEF(VT.getVectorElementType()); | |||
| 8906 | ||||
| 8907 | SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1); | |||
| 8908 | return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1); | |||
| 8909 | } | |||
| 8910 | ||||
| 8911 | // Recurse into target specific vector shuffles to find scalars. | |||
| 8912 | if (isTargetShuffle(Opcode)) { | |||
| 8913 | MVT ShufVT = VT.getSimpleVT(); | |||
| 8914 | MVT ShufSVT = ShufVT.getVectorElementType(); | |||
| 8915 | int NumElems = (int)ShufVT.getVectorNumElements(); | |||
| 8916 | SmallVector<int, 16> ShuffleMask; | |||
| 8917 | SmallVector<SDValue, 16> ShuffleOps; | |||
| 8918 | if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps, | |||
| 8919 | ShuffleMask)) | |||
| 8920 | return SDValue(); | |||
| 8921 | ||||
| 8922 | int Elt = ShuffleMask[Index]; | |||
| 8923 | if (Elt == SM_SentinelZero) | |||
| 8924 | return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT) | |||
| 8925 | : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT); | |||
| 8926 | if (Elt == SM_SentinelUndef) | |||
| 8927 | return DAG.getUNDEF(ShufSVT); | |||
| 8928 | ||||
| 8929 | assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range"); | |||
| 8930 | SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1]; | |||
| 8931 | return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1); | |||
| 8932 | } | |||
| 8933 | ||||
| 8934 | // Recurse into insert_subvector base/sub vector to find scalars. | |||
| 8935 | if (Opcode == ISD::INSERT_SUBVECTOR) { | |||
| 8936 | SDValue Vec = Op.getOperand(0); | |||
| 8937 | SDValue Sub = Op.getOperand(1); | |||
| 8938 | uint64_t SubIdx = Op.getConstantOperandVal(2); | |||
| 8939 | unsigned NumSubElts = Sub.getValueType().getVectorNumElements(); | |||
| 8940 | ||||
| 8941 | if (SubIdx <= Index && Index < (SubIdx + NumSubElts)) | |||
| 8942 | return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1); | |||
| 8943 | return getShuffleScalarElt(Vec, Index, DAG, Depth + 1); | |||
| 8944 | } | |||
| 8945 | ||||
| 8946 | // Recurse into concat_vectors sub vector to find scalars. | |||
| 8947 | if (Opcode == ISD::CONCAT_VECTORS) { | |||
| 8948 | EVT SubVT = Op.getOperand(0).getValueType(); | |||
| 8949 | unsigned NumSubElts = SubVT.getVectorNumElements(); | |||
| 8950 | uint64_t SubIdx = Index / NumSubElts; | |||
| 8951 | uint64_t SubElt = Index % NumSubElts; | |||
| 8952 | return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1); | |||
| 8953 | } | |||
| 8954 | ||||
| 8955 | // Recurse into extract_subvector src vector to find scalars. | |||
| 8956 | if (Opcode == ISD::EXTRACT_SUBVECTOR) { | |||
| 8957 | SDValue Src = Op.getOperand(0); | |||
| 8958 | uint64_t SrcIdx = Op.getConstantOperandVal(1); | |||
| 8959 | return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1); | |||
| 8960 | } | |||
| 8961 | ||||
| 8962 | // We only peek through bitcasts that keep the same number of vector elements. | |||
| 8963 | if (Opcode == ISD::BITCAST) { | |||
| 8964 | SDValue Src = Op.getOperand(0); | |||
| 8965 | EVT SrcVT = Src.getValueType(); | |||
| 8966 | if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems) | |||
| 8967 | return getShuffleScalarElt(Src, Index, DAG, Depth + 1); | |||
| 8968 | return SDValue(); | |||
| 8969 | } | |||
| 8970 | ||||
| 8971 | // Actual nodes that may contain scalar elements | |||
| 8972 | ||||
| 8973 | // For insert_vector_elt - either return the index matching scalar or recurse | |||
| 8974 | // into the base vector. | |||
| 8975 | if (Opcode == ISD::INSERT_VECTOR_ELT && | |||
| 8976 | isa<ConstantSDNode>(Op.getOperand(2))) { | |||
| 8977 | if (Op.getConstantOperandAPInt(2) == Index) | |||
| 8978 | return Op.getOperand(1); | |||
| 8979 | return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1); | |||
| 8980 | } | |||
| 8981 | ||||
| 8982 | if (Opcode == ISD::SCALAR_TO_VECTOR) | |||
| 8983 | return (Index == 0) ? Op.getOperand(0) | |||
| 8984 | : DAG.getUNDEF(VT.getVectorElementType()); | |||
| 8985 | ||||
| 8986 | if (Opcode == ISD::BUILD_VECTOR) | |||
| 8987 | return Op.getOperand(Index); | |||
| 8988 | ||||
| 8989 | return SDValue(); | |||
| 8990 | } | |||
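| | // Worked example (illustrative): for Op = shuffle v4i32 X, Y, <5,1,2,3> | |||
| | // and Index = 0, mask element 5 selects operand Y at index 5 % 4 = 1; if | |||
| | // Y is build_vector(a,b,c,d) the recursion returns the scalar b. | |||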
| 8991 | ||||
| 8992 | // Use PINSRB/PINSRW/PINSRD to create a build vector. | |||
| 8993 | static SDValue LowerBuildVectorAsInsert(SDValue Op, const APInt &NonZeroMask, | |||
| 8994 | unsigned NumNonZero, unsigned NumZero, | |||
| 8995 | SelectionDAG &DAG, | |||
| 8996 | const X86Subtarget &Subtarget) { | |||
| 8997 | MVT VT = Op.getSimpleValueType(); | |||
| 8998 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 8999 | assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) || | |||
| 9000 | ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) && | |||
| 9001 | "Illegal vector insertion"); | |||
| 9002 | ||||
| 9003 | SDLoc dl(Op); | |||
| 9004 | SDValue V; | |||
| 9005 | bool First = true; | |||
| 9006 | ||||
| 9007 | for (unsigned i = 0; i < NumElts; ++i) { | |||
| 9008 | bool IsNonZero = NonZeroMask[i]; | |||
| 9009 | if (!IsNonZero) | |||
| 9010 | continue; | |||
| 9011 | ||||
| 9012 | // If the build vector contains zeros or our first insertion is not at | |||
| 9013 | // index 0, insert into a zero vector to break any register dependency; | |||
| 9014 | // otherwise use SCALAR_TO_VECTOR. | |||
| 9015 | if (First) { | |||
| 9016 | First = false; | |||
| 9017 | if (NumZero || 0 != i) | |||
| 9018 | V = getZeroVector(VT, Subtarget, DAG, dl); | |||
| 9019 | else { | |||
| 9020 | assert(0 == i && "Expected insertion into zero-index"); | |||
| 9021 | V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); | |||
| 9022 | V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); | |||
| 9023 | V = DAG.getBitcast(VT, V); | |||
| 9024 | continue; | |||
| 9025 | } | |||
| 9026 | } | |||
| 9027 | V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i), | |||
| 9028 | DAG.getIntPtrConstant(i, dl)); | |||
| 9029 | } | |||
| 9030 | ||||
| 9031 | return V; | |||
| 9032 | } | |||
| 9033 | ||||
| 9034 | /// Custom lower build_vector of v16i8. | |||
| 9035 | static SDValue LowerBuildVectorv16i8(SDValue Op, const APInt &NonZeroMask, | |||
| 9036 | unsigned NumNonZero, unsigned NumZero, | |||
| 9037 | SelectionDAG &DAG, | |||
| 9038 | const X86Subtarget &Subtarget) { | |||
| 9039 | if (NumNonZero > 8 && !Subtarget.hasSSE41()) | |||
| 9040 | return SDValue(); | |||
| 9041 | ||||
| 9042 | // SSE4.1 - use PINSRB to insert each byte directly. | |||
| 9043 | if (Subtarget.hasSSE41()) | |||
| 9044 | return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG, | |||
| 9045 | Subtarget); | |||
| 9046 | ||||
| 9047 | SDLoc dl(Op); | |||
| 9048 | SDValue V; | |||
| 9049 | ||||
| 9050 | // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. | |||
| 9051 | for (unsigned i = 0; i < 16; i += 2) { | |||
| 9052 | bool ThisIsNonZero = NonZeroMask[i]; | |||
| 9053 | bool NextIsNonZero = NonZeroMask[i + 1]; | |||
| 9054 | if (!ThisIsNonZero && !NextIsNonZero) | |||
| 9055 | continue; | |||
| 9056 | ||||
| 9057 | // FIXME: Investigate combining the first 4 bytes as an i32 instead. | |||
| 9058 | SDValue Elt; | |||
| 9059 | if (ThisIsNonZero) { | |||
| 9060 | if (NumZero || NextIsNonZero) | |||
| 9061 | Elt = DAG.getZExtOrTrunc(Op.getOperand(i), dl, MVT::i32); | |||
| 9062 | else | |||
| 9063 | Elt = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); | |||
| 9064 | } | |||
| 9065 | ||||
| 9066 | if (NextIsNonZero) { | |||
| 9067 | SDValue NextElt = Op.getOperand(i + 1); | |||
| 9068 | if (i == 0 && NumZero) | |||
| 9069 | NextElt = DAG.getZExtOrTrunc(NextElt, dl, MVT::i32); | |||
| 9070 | else | |||
| 9071 | NextElt = DAG.getAnyExtOrTrunc(NextElt, dl, MVT::i32); | |||
| 9072 | NextElt = DAG.getNode(ISD::SHL, dl, MVT::i32, NextElt, | |||
| 9073 | DAG.getConstant(8, dl, MVT::i8)); | |||
| 9074 | if (ThisIsNonZero) | |||
| 9075 | Elt = DAG.getNode(ISD::OR, dl, MVT::i32, NextElt, Elt); | |||
| 9076 | else | |||
| 9077 | Elt = NextElt; | |||
| 9078 | } | |||
| 9079 | ||||
| 9080 | // If our first insertion is not the first index or zeros are needed, then | |||
| 9081 | // insert into zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high | |||
| 9082 | // elements undefined). | |||
| 9083 | if (!V) { | |||
| 9084 | if (i != 0 || NumZero) | |||
| 9085 | V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); | |||
| 9086 | else { | |||
| 9087 | V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt); | |||
| 9088 | V = DAG.getBitcast(MVT::v8i16, V); | |||
| 9089 | continue; | |||
| 9090 | } | |||
| 9091 | } | |||
| 9092 | Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Elt); | |||
| 9093 | V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Elt, | |||
| 9094 | DAG.getIntPtrConstant(i / 2, dl)); | |||
| 9095 | } | |||
| 9096 | ||||
| 9097 | return DAG.getBitcast(MVT::v16i8, V); | |||
| 9098 | } | |||
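| | // Worked example (illustrative, little-endian): bytes b0 and b1 of the | |||
| | // build vector merge into the 16-bit value (zext(b1) << 8) | zext(b0), | |||
| | // which lands in word 0 of the v8i16 via PINSRW; the final bitcast | |||
| | // restores the v16i8 type. | |||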
| 9099 | ||||
| 9100 | /// Custom lower build_vector of v8i16. | |||
| 9101 | static SDValue LowerBuildVectorv8i16(SDValue Op, const APInt &NonZeroMask, | |||
| 9102 | unsigned NumNonZero, unsigned NumZero, | |||
| 9103 | SelectionDAG &DAG, | |||
| 9104 | const X86Subtarget &Subtarget) { | |||
| 9105 | if (NumNonZero > 4 && !Subtarget.hasSSE41()) | |||
| 9106 | return SDValue(); | |||
| 9107 | ||||
| 9108 | // Use PINSRW to insert each element directly. | |||
| 9109 | return LowerBuildVectorAsInsert(Op, NonZeroMask, NumNonZero, NumZero, DAG, | |||
| 9110 | Subtarget); | |||
| 9111 | } | |||
| 9112 | ||||
| 9113 | /// Custom lower build_vector of v4i32 or v4f32. | |||
| 9114 | static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG, | |||
| 9115 | const X86Subtarget &Subtarget) { | |||
| 9116 | // If this is a splat of a pair of elements, use MOVDDUP (unless the target | |||
| 9117 | // has XOP; in that case defer lowering to potentially use VPERMIL2PS). | |||
| 9118 | // Because we're creating a less complicated build vector here, we may enable | |||
| 9119 | // further folding of the MOVDDUP via shuffle transforms. | |||
| 9120 | if (Subtarget.hasSSE3() && !Subtarget.hasXOP() && | |||
| 9121 | Op.getOperand(0) == Op.getOperand(2) && | |||
| 9122 | Op.getOperand(1) == Op.getOperand(3) && | |||
| 9123 | Op.getOperand(0) != Op.getOperand(1)) { | |||
| 9124 | SDLoc DL(Op); | |||
| 9125 | MVT VT = Op.getSimpleValueType(); | |||
| 9126 | MVT EltVT = VT.getVectorElementType(); | |||
| 9127 | // Create a new build vector with the first 2 elements followed by undef | |||
| 9128 | // padding, bitcast to v2f64, duplicate, and bitcast back. | |||
| 9129 | SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1), | |||
| 9130 | DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) }; | |||
| 9131 | SDValue NewBV = DAG.getBitcast(MVT::v2f64, DAG.getBuildVector(VT, DL, Ops)); | |||
| 9132 | SDValue Dup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, NewBV); | |||
| 9133 | return DAG.getBitcast(VT, Dup); | |||
| 9134 | } | |||
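| | // Illustrative sketch: build_vector(a, b, a, b) : v4f32 becomes | |||
| | // build_vector(a, b, undef, undef), which is bitcast to v2f64 and fed to | |||
| | // MOVDDUP; duplicating the low 64 bits reproduces the {a,b,a,b} splat. | |||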
| 9135 | ||||
| 9136 | // Find all zeroable elements. | |||
| 9137 | std::bitset<4> Zeroable, Undefs; | |||
| 9138 | for (int i = 0; i < 4; ++i) { | |||
| 9139 | SDValue Elt = Op.getOperand(i); | |||
| 9140 | Undefs[i] = Elt.isUndef(); | |||
| 9141 | Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt)); | |||
| 9142 | } | |||
| 9143 | assert(Zeroable.size() - Zeroable.count() > 1 && | |||
| 9144 | "We expect at least two non-zero elements!"); | |||
| 9145 | ||||
| 9146 | // We only know how to deal with build_vector nodes where elements are either | |||
| 9147 | // zeroable or extract_vector_elt with constant index. | |||
| 9148 | SDValue FirstNonZero; | |||
| 9149 | unsigned FirstNonZeroIdx; | |||
| 9150 | for (unsigned i = 0; i < 4; ++i) { | |||
| 9151 | if (Zeroable[i]) | |||
| 9152 | continue; | |||
| 9153 | SDValue Elt = Op.getOperand(i); | |||
| 9154 | if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || | |||
| 9155 | !isa<ConstantSDNode>(Elt.getOperand(1))) | |||
| 9156 | return SDValue(); | |||
| 9157 | // Make sure that this node is extracting from a 128-bit vector. | |||
| 9158 | MVT VT = Elt.getOperand(0).getSimpleValueType(); | |||
| 9159 | if (!VT.is128BitVector()) | |||
| 9160 | return SDValue(); | |||
| 9161 | if (!FirstNonZero.getNode()) { | |||
| 9162 | FirstNonZero = Elt; | |||
| 9163 | FirstNonZeroIdx = i; | |||
| 9164 | } | |||
| 9165 | } | |||
| 9166 | ||||
| 9167 | assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!"); | |||
| 9168 | SDValue V1 = FirstNonZero.getOperand(0); | |||
| 9169 | MVT VT = V1.getSimpleValueType(); | |||
| 9170 | ||||
| 9171 | // See if this build_vector can be lowered as a blend with zero. | |||
| 9172 | SDValue Elt; | |||
| 9173 | unsigned EltMaskIdx, EltIdx; | |||
| 9174 | int Mask[4]; | |||
| 9175 | for (EltIdx = 0; EltIdx < 4; ++EltIdx) { | |||
| 9176 | if (Zeroable[EltIdx]) { | |||
| 9177 | // The zero vector will be on the right hand side. | |||
| 9178 | Mask[EltIdx] = EltIdx+4; | |||
| 9179 | continue; | |||
| 9180 | } | |||
| 9181 | ||||
| 9182 | Elt = Op->getOperand(EltIdx); | |||
| 9183 | // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index. | |||
| 9184 | EltMaskIdx = Elt.getConstantOperandVal(1); | |||
| 9185 | if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx) | |||
| 9186 | break; | |||
| 9187 | Mask[EltIdx] = EltIdx; | |||
| 9188 | } | |||
| 9189 | ||||
| 9190 | if (EltIdx == 4) { | |||
| 9191 | // Let the shuffle legalizer deal with blend operations. | |||
| 9192 | SDValue VZeroOrUndef = (Zeroable == Undefs) | |||
| 9193 | ? DAG.getUNDEF(VT) | |||
| 9194 | : getZeroVector(VT, Subtarget, DAG, SDLoc(Op)); | |||
| 9195 | if (V1.getSimpleValueType() != VT) | |||
| 9196 | V1 = DAG.getBitcast(VT, V1); | |||
| 9197 | return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZeroOrUndef, Mask); | |||
| 9198 | } | |||
| 9199 | ||||
| 9200 | // See if we can lower this build_vector to an INSERTPS. | |||
| 9201 | if (!Subtarget.hasSSE41()) | |||
| 9202 | return SDValue(); | |||
| 9203 | ||||
| 9204 | SDValue V2 = Elt.getOperand(0); | |||
| 9205 | if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx) | |||
| 9206 | V1 = SDValue(); | |||
| 9207 | ||||
| 9208 | bool CanFold = true; | |||
| 9209 | for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) { | |||
| 9210 | if (Zeroable[i]) | |||
| 9211 | continue; | |||
| 9212 | ||||
| 9213 | SDValue Current = Op->getOperand(i); | |||
| 9214 | SDValue SrcVector = Current->getOperand(0); | |||
| 9215 | if (!V1.getNode()) | |||
| 9216 | V1 = SrcVector; | |||
| 9217 | CanFold = (SrcVector == V1) && (Current.getConstantOperandAPInt(1) == i); | |||
| 9218 | } | |||
| 9219 | ||||
| 9220 | if (!CanFold) | |||
| 9221 | return SDValue(); | |||
| 9222 | ||||
| 9223 | assert(V1.getNode() && "Expected at least two non-zero elements!"); | |||
| 9224 | if (V1.getSimpleValueType() != MVT::v4f32) | |||
| 9225 | V1 = DAG.getBitcast(MVT::v4f32, V1); | |||
| 9226 | if (V2.getSimpleValueType() != MVT::v4f32) | |||
| 9227 | V2 = DAG.getBitcast(MVT::v4f32, V2); | |||
| 9228 | ||||
| 9229 | // Ok, we can emit an INSERTPS instruction. | |||
| 9230 | unsigned ZMask = Zeroable.to_ulong(); | |||
| 9231 | ||||
| 9232 | unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask; | |||
| 9233 | assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!"); | |||
| 9234 | SDLoc DL(Op); | |||
| 9235 | SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2, | |||
| 9236 | DAG.getIntPtrConstant(InsertPSMask, DL, true)); | |||
| 9237 | return DAG.getBitcast(VT, Result); | |||
| 9238 | } | |||
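| | // INSERTPS imm8 layout (for reference): bits [7:6] select the source | |||
| | // element (EltMaskIdx), bits [5:4] the destination slot (EltIdx), and | |||
| | // bits [3:0] zero destination lanes. E.g. EltMaskIdx = 2, EltIdx = 1, | |||
| | // ZMask = 0b1000 gives imm8 = 0x98: insert src[2] into dst[1], zero dst[3]. | |||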
| 9239 | ||||
| 9240 | /// Return a vector logical shift node. | |||
| 9241 | static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, | |||
| 9242 | SelectionDAG &DAG, const TargetLowering &TLI, | |||
| 9243 | const SDLoc &dl) { | |||
| 9244 | assert(VT.is128BitVector() && "Unknown type for VShift"); | |||
| 9245 | MVT ShVT = MVT::v16i8; | |||
| 9246 | unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; | |||
| 9247 | SrcOp = DAG.getBitcast(ShVT, SrcOp); | |||
| 9248 | assert(NumBits % 8 == 0 && "Only support byte sized shifts"); | |||
| 9249 | SDValue ShiftVal = DAG.getTargetConstant(NumBits / 8, dl, MVT::i8); | |||
| 9250 | return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal)); | |||
| 9251 | } | |||
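| | // Usage sketch (hypothetical call): getVShift(false, MVT::v2i64, V, 64, | |||
| | // DAG, TLI, dl) bitcasts V to v16i8 and emits VSRLDQ (PSRLDQ) by 8 bytes, | |||
| | // moving the upper i64 element of V into the lower half of the register. | |||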
| 9252 | ||||
| 9253 | static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl, | |||
| 9254 | SelectionDAG &DAG) { | |||
| 9255 | ||||
| 9256 | // Check if the scalar load can be widened into a vector load, and if the | |||
| 9257 | // address is "base + cst", see whether the cst can be "absorbed" into | |||
| 9258 | // the shuffle mask. | |||
| 9259 | if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) { | |||
| 9260 | SDValue Ptr = LD->getBasePtr(); | |||
| 9261 | if (!ISD::isNormalLoad(LD) || !LD->isSimple()) | |||
| 9262 | return SDValue(); | |||
| 9263 | EVT PVT = LD->getValueType(0); | |||
| 9264 | if (PVT != MVT::i32 && PVT != MVT::f32) | |||
| 9265 | return SDValue(); | |||
| 9266 | ||||
| 9267 | int FI = -1; | |||
| 9268 | int64_t Offset = 0; | |||
| 9269 | if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) { | |||
| 9270 | FI = FINode->getIndex(); | |||
| 9271 | Offset = 0; | |||
| 9272 | } else if (DAG.isBaseWithConstantOffset(Ptr) && | |||
| 9273 | isa<FrameIndexSDNode>(Ptr.getOperand(0))) { | |||
| 9274 | FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex(); | |||
| 9275 | Offset = Ptr.getConstantOperandVal(1); | |||
| 9276 | Ptr = Ptr.getOperand(0); | |||
| 9277 | } else { | |||
| 9278 | return SDValue(); | |||
| 9279 | } | |||
| 9280 | ||||
| 9281 | // FIXME: 256-bit vector instructions don't require strict alignment; | |||
| 9282 | // improve this code to support them better. | |||
| 9283 | Align RequiredAlign(VT.getSizeInBits() / 8); | |||
| 9284 | SDValue Chain = LD->getChain(); | |||
| 9285 | // Make sure the stack object alignment is at least 16 or 32. | |||
| 9286 | MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); | |||
| 9287 | MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr); | |||
| 9288 | if (!InferredAlign || *InferredAlign < RequiredAlign) { | |||
| 9289 | if (MFI.isFixedObjectIndex(FI)) { | |||
| 9290 | // Can't change the alignment. FIXME: It's possible to compute the exact | |||
| 9291 | // stack offset and reference FI + adjusted offset instead, if someone | |||
| 9292 | // *really* cares about this. | |||
| 9293 | return SDValue(); | |||
| 9294 | } else { | |||
| 9295 | MFI.setObjectAlignment(FI, RequiredAlign); | |||
| 9296 | } | |||
| 9297 | } | |||
| 9298 | ||||
| 9299 | // (Offset % 16 or 32) must be a multiple of 4. The address is then | |||
| 9300 | // Ptr + (Offset & ~15). | |||
| 9301 | if (Offset < 0) | |||
| 9302 | return SDValue(); | |||
| 9303 | if ((Offset % RequiredAlign.value()) & 3) | |||
| 9304 | return SDValue(); | |||
| 9305 | int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1); | |||
| 9306 | if (StartOffset) { | |||
| 9307 | SDLoc DL(Ptr); | |||
| 9308 | Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, | |||
| 9309 | DAG.getConstant(StartOffset, DL, Ptr.getValueType())); | |||
| 9310 | } | |||
| 9311 | ||||
| 9312 | int EltNo = (Offset - StartOffset) >> 2; | |||
| 9313 | unsigned NumElems = VT.getVectorNumElements(); | |||
| 9314 | ||||
| 9315 | EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); | |||
| 9316 | SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, | |||
| 9317 | LD->getPointerInfo().getWithOffset(StartOffset)); | |||
| 9318 | ||||
| 9319 | SmallVector<int, 8> Mask(NumElems, EltNo); | |||
| 9320 | ||||
| 9321 | return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask); | |||
| 9322 | } | |||
| 9323 | ||||
| 9324 | return SDValue(); | |||
| 9325 | } | |||
| 9326 | ||||
| 9327 | // Recurse to find a LoadSDNode source and the accumulated ByteOffset. | |||
| 9328 | static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) { | |||
| 9329 | if (ISD::isNON_EXTLoad(Elt.getNode())) { | |||
| 9330 | auto *BaseLd = cast<LoadSDNode>(Elt); | |||
| 9331 | if (!BaseLd->isSimple()) | |||
| 9332 | return false; | |||
| 9333 | Ld = BaseLd; | |||
| 9334 | ByteOffset = 0; | |||
| 9335 | return true; | |||
| 9336 | } | |||
| 9337 | ||||
| 9338 | switch (Elt.getOpcode()) { | |||
| 9339 | case ISD::BITCAST: | |||
| 9340 | case ISD::TRUNCATE: | |||
| 9341 | case ISD::SCALAR_TO_VECTOR: | |||
| 9342 | return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset); | |||
| 9343 | case ISD::SRL: | |||
| 9344 | if (auto *AmtC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) { | |||
| 9345 | uint64_t Amt = AmtC->getZExtValue(); | |||
| 9346 | if ((Amt % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) { | |||
| 9347 | ByteOffset += Amt / 8; | |||
| 9348 | return true; | |||
| 9349 | } | |||
| 9350 | } | |||
| 9351 | break; | |||
| 9352 | case ISD::EXTRACT_VECTOR_ELT: | |||
| 9353 | if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) { | |||
| 9354 | SDValue Src = Elt.getOperand(0); | |||
| 9355 | unsigned SrcSizeInBits = Src.getScalarValueSizeInBits(); | |||
| 9356 | unsigned DstSizeInBits = Elt.getScalarValueSizeInBits(); | |||
| 9357 | if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 && | |||
| 9358 | findEltLoadSrc(Src, Ld, ByteOffset)) { | |||
| 9359 | uint64_t Idx = IdxC->getZExtValue(); | |||
| 9360 | ByteOffset += Idx * (SrcSizeInBits / 8); | |||
| 9361 | return true; | |||
| 9362 | } | |||
| 9363 | } | |||
| 9364 | break; | |||
| 9365 | } | |||
| 9366 | ||||
| 9367 | return false; | |||
| 9368 | } | |||
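| | // Worked example (illustrative): for Elt = trunc i32 (srl (load i64 P), 32) | |||
| | // the recursion peeks through the truncate, adds 32/8 = 4 for the SRL, and | |||
| | // reaches the load, so Ld = (load i64 P) and ByteOffset = 4, i.e. the | |||
| | // element is bytes 4..7 of the 8-byte load. | |||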
| 9369 | ||||
| 9370 | /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the | |||
| 9371 | /// elements can be replaced by a single large load which has the same value as | |||
| 9372 | /// a build_vector or insert_subvector whose loaded operands are 'Elts'. | |||
| 9373 | /// | |||
| 9374 | /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a | |||
| 9375 | static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, | |||
| 9376 | const SDLoc &DL, SelectionDAG &DAG, | |||
| 9377 | const X86Subtarget &Subtarget, | |||
| 9378 | bool IsAfterLegalize) { | |||
| 9379 | if ((VT.getScalarSizeInBits() % 8) != 0) | |||
| 9380 | return SDValue(); | |||
| 9381 | ||||
| 9382 | unsigned NumElems = Elts.size(); | |||
| 9383 | ||||
| 9384 | int LastLoadedElt = -1; | |||
| 9385 | APInt LoadMask = APInt::getZero(NumElems); | |||
| 9386 | APInt ZeroMask = APInt::getZero(NumElems); | |||
| 9387 | APInt UndefMask = APInt::getZero(NumElems); | |||
| 9388 | ||||
| 9389 | SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr); | |||
| 9390 | SmallVector<int64_t, 8> ByteOffsets(NumElems, 0); | |||
| 9391 | ||||
| 9392 | // For each element in the initializer, see if we've found a load, zero or an | |||
| 9393 | // undef. | |||
| 9394 | for (unsigned i = 0; i < NumElems; ++i) { | |||
| 9395 | SDValue Elt = peekThroughBitcasts(Elts[i]); | |||
| 9396 | if (!Elt.getNode()) | |||
| 9397 | return SDValue(); | |||
| 9398 | if (Elt.isUndef()) { | |||
| 9399 | UndefMask.setBit(i); | |||
| 9400 | continue; | |||
| 9401 | } | |||
| 9402 | if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode())) { | |||
| 9403 | ZeroMask.setBit(i); | |||
| 9404 | continue; | |||
| 9405 | } | |||
| 9406 | ||||
| 9407 | // Each loaded element must be the correct fractional portion of the | |||
| 9408 | // requested vector load. | |||
| 9409 | unsigned EltSizeInBits = Elt.getValueSizeInBits(); | |||
| 9410 | if ((NumElems * EltSizeInBits) != VT.getSizeInBits()) | |||
| 9411 | return SDValue(); | |||
| 9412 | ||||
| 9413 | if (!findEltLoadSrc(Elt, Loads[i], ByteOffsets[i]) || ByteOffsets[i] < 0) | |||
| 9414 | return SDValue(); | |||
| 9415 | unsigned LoadSizeInBits = Loads[i]->getValueSizeInBits(0); | |||
| 9416 | if (((ByteOffsets[i] * 8) + EltSizeInBits) > LoadSizeInBits) | |||
| 9417 | return SDValue(); | |||
| 9418 | ||||
| 9419 | LoadMask.setBit(i); | |||
| 9420 | LastLoadedElt = i; | |||
| 9421 | } | |||
| 9422 | assert((ZeroMask.popcount() + UndefMask.popcount() + LoadMask.popcount()) == | |||
| 9423 | NumElems && | |||
| 9424 | "Incomplete element masks"); | |||
| 9425 | ||||
| 9426 | // Handle Special Cases - all undef or undef/zero. | |||
| 9427 | if (UndefMask.popcount() == NumElems) | |||
| 9428 | return DAG.getUNDEF(VT); | |||
| 9429 | if ((ZeroMask.popcount() + UndefMask.popcount()) == NumElems) | |||
| 9430 | return VT.isInteger() ? DAG.getConstant(0, DL, VT) | |||
| 9431 | : DAG.getConstantFP(0.0, DL, VT); | |||
| 9432 | ||||
| 9433 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 9434 | int FirstLoadedElt = LoadMask.countr_zero(); | |||
| 9435 | SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]); | |||
| 9436 | EVT EltBaseVT = EltBase.getValueType(); | |||
| 9437 | assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() && | |||
| 9438 | "Register/Memory size mismatch"); | |||
| 9439 | LoadSDNode *LDBase = Loads[FirstLoadedElt]; | |||
| 9440 | assert(LDBase && "Did not find base load for merging consecutive loads"); | |||
| 9441 | unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits(); | |||
| 9442 | unsigned BaseSizeInBytes = BaseSizeInBits / 8; | |||
| 9443 | int NumLoadedElts = (1 + LastLoadedElt - FirstLoadedElt); | |||
| 9444 | int LoadSizeInBits = NumLoadedElts * BaseSizeInBits; | |||
| 9445 | assert((BaseSizeInBits % 8) == 0 && "Sub-byte element loads detected"); | |||
| 9446 | ||||
| 9447 | // TODO: Support offsetting the base load. | |||
| 9448 | if (ByteOffsets[FirstLoadedElt] != 0) | |||
| 9449 | return SDValue(); | |||
| 9450 | ||||
| 9451 | // Check to see if the element's load is consecutive to the base load | |||
| 9452 | // or offset from a previous (already checked) load. | |||
| 9453 | auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) { | |||
| 9454 | LoadSDNode *Ld = Loads[EltIdx]; | |||
| 9455 | int64_t ByteOffset = ByteOffsets[EltIdx]; | |||
| 9456 | if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) { | |||
| 9457 | int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes); | |||
| 9458 | return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] && | |||
| 9459 | Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0); | |||
| 9460 | } | |||
| 9461 | return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes, | |||
| 9462 | EltIdx - FirstLoadedElt); | |||
| 9463 | }; | |||
| 9464 | ||||
| 9465 | // Consecutive loads can contain UNDEFs but not ZERO elements. | |||
| 9466 | // Consecutive loads with UNDEFs and ZEROs require an additional shuffle | |||
| 9467 | // stage to clear the ZERO elements. | |||
| 9468 | bool IsConsecutiveLoad = true; | |||
| 9469 | bool IsConsecutiveLoadWithZeros = true; | |||
| 9470 | for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) { | |||
| 9471 | if (LoadMask[i]) { | |||
| 9472 | if (!CheckConsecutiveLoad(LDBase, i)) { | |||
| 9473 | IsConsecutiveLoad = false; | |||
| 9474 | IsConsecutiveLoadWithZeros = false; | |||
| 9475 | break; | |||
| 9476 | } | |||
| 9477 | } else if (ZeroMask[i]) { | |||
| 9478 | IsConsecutiveLoad = false; | |||
| 9479 | } | |||
| 9480 | } | |||
| 9481 | ||||
| 9482 | auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) { | |||
| 9483 | auto MMOFlags = LDBase->getMemOperand()->getFlags(); | |||
| 9484 | assert(LDBase->isSimple() && | |||
| 9485 | "Cannot merge volatile or atomic loads."); | |||
| 9486 | SDValue NewLd = | |||
| 9487 | DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), | |||
| 9488 | LDBase->getPointerInfo(), LDBase->getOriginalAlign(), | |||
| 9489 | MMOFlags); | |||
| 9490 | for (auto *LD : Loads) | |||
| 9491 | if (LD) | |||
| 9492 | DAG.makeEquivalentMemoryOrdering(LD, NewLd); | |||
| 9493 | return NewLd; | |||
| 9494 | }; | |||
| 9495 | ||||
| 9496 | // Check if the base load is entirely dereferenceable. | |||
| 9497 | bool IsDereferenceable = LDBase->getPointerInfo().isDereferenceable( | |||
| 9498 | VT.getSizeInBits() / 8, *DAG.getContext(), DAG.getDataLayout()); | |||
| 9499 | ||||
| 9500 | // LOAD - all consecutive load/undefs (must start/end with a load or be | |||
| 9501 | // entirely dereferenceable). If we have found an entire vector of loads and | |||
| 9502 | // undefs, then return a large load of the entire vector width starting at the | |||
| 9503 | // base pointer. If the vector contains zeros, then attempt to shuffle those | |||
| 9504 | // elements. | |||
| 9505 | if (FirstLoadedElt == 0 && | |||
| 9506 | (NumLoadedElts == (int)NumElems || IsDereferenceable) && | |||
| 9507 | (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) { | |||
| 9508 | if (IsAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT)) | |||
| 9509 | return SDValue(); | |||
| 9510 | ||||
| 9511 | // Don't create 256-bit non-temporal aligned loads without AVX2 as these | |||
| 9512 | // will lower to regular temporal loads and use the cache. | |||
| 9513 | if (LDBase->isNonTemporal() && LDBase->getAlign() >= Align(32) && | |||
| 9514 | VT.is256BitVector() && !Subtarget.hasInt256()) | |||
| 9515 | return SDValue(); | |||
| 9516 | ||||
| 9517 | if (NumElems == 1) | |||
| 9518 | return DAG.getBitcast(VT, Elts[FirstLoadedElt]); | |||
| 9519 | ||||
| 9520 | if (!ZeroMask) | |||
| 9521 | return CreateLoad(VT, LDBase); | |||
| 9522 | ||||
| 9523 | // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded | |||
| 9524 | // vector and a zero vector to clear out the zero elements. | |||
| 9525 | if (!IsAfterLegalize && VT.isVector()) { | |||
| 9526 | unsigned NumMaskElts = VT.getVectorNumElements(); | |||
| 9527 | if ((NumMaskElts % NumElems) == 0) { | |||
| 9528 | unsigned Scale = NumMaskElts / NumElems; | |||
| 9529 | SmallVector<int, 4> ClearMask(NumMaskElts, -1); | |||
| 9530 | for (unsigned i = 0; i < NumElems; ++i) { | |||
| 9531 | if (UndefMask[i]) | |||
| 9532 | continue; | |||
| 9533 | int Offset = ZeroMask[i] ? NumMaskElts : 0; | |||
| 9534 | for (unsigned j = 0; j != Scale; ++j) | |||
| 9535 | ClearMask[(i * Scale) + j] = (i * Scale) + j + Offset; | |||
| 9536 | } | |||
| 9537 | SDValue V = CreateLoad(VT, LDBase); | |||
| 9538 | SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT) | |||
| 9539 | : DAG.getConstantFP(0.0, DL, VT); | |||
| 9540 | return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask); | |||
| 9541 | } | |||
| 9542 | } | |||
| 9543 | } | |||
| 9544 | ||||
| 9545 | // If the upper half of a ymm/zmm load is undef then just load the lower half. | |||
| 9546 | if (VT.is256BitVector() || VT.is512BitVector()) { | |||
| 9547 | unsigned HalfNumElems = NumElems / 2; | |||
| 9548 | if (UndefMask.extractBits(HalfNumElems, HalfNumElems).isAllOnes()) { | |||
| 9549 | EVT HalfVT = | |||
| 9550 | EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(), HalfNumElems); | |||
| 9551 | SDValue HalfLD = | |||
| 9552 | EltsFromConsecutiveLoads(HalfVT, Elts.drop_back(HalfNumElems), DL, | |||
| 9553 | DAG, Subtarget, IsAfterLegalize); | |||
| 9554 | if (HalfLD) | |||
| 9555 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), | |||
| 9556 | HalfLD, DAG.getIntPtrConstant(0, DL)); | |||
| 9557 | } | |||
| 9558 | } | |||
| 9559 | ||||
| 9560 | // VZEXT_LOAD - consecutive 16/32/64-bit load/undefs followed by zeros/undefs. | |||
| 9561 | if (IsConsecutiveLoad && FirstLoadedElt == 0 && | |||
| 9562 | ((LoadSizeInBits == 16 && Subtarget.hasFP16()) || LoadSizeInBits == 32 || | |||
| 9563 | LoadSizeInBits == 64) && | |||
| 9564 | ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) { | |||
| 9565 | MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits) | |||
| 9566 | : MVT::getIntegerVT(LoadSizeInBits); | |||
| 9567 | MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits); | |||
| 9568 | // Allow v4f32 on SSE1 only targets. | |||
| 9569 | // FIXME: Add more isel patterns so we can just use VT directly. | |||
| 9570 | if (!Subtarget.hasSSE2() && VT == MVT::v4f32) | |||
| 9571 | VecVT = MVT::v4f32; | |||
| 9572 | if (TLI.isTypeLegal(VecVT)) { | |||
| 9573 | SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); | |||
| 9574 | SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; | |||
| 9575 | SDValue ResNode = DAG.getMemIntrinsicNode( | |||
| 9576 | X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(), | |||
| 9577 | LDBase->getOriginalAlign(), MachineMemOperand::MOLoad); | |||
| 9578 | for (auto *LD : Loads) | |||
| 9579 | if (LD) | |||
| 9580 | DAG.makeEquivalentMemoryOrdering(LD, ResNode); | |||
| 9581 | return DAG.getBitcast(VT, ResNode); | |||
| 9582 | } | |||
| 9583 | } | |||
| 9584 | ||||
| 9585 | // BROADCAST - match the smallest possible repetition pattern, load that | |||
| 9586 | // scalar/subvector element and then broadcast to the entire vector. | |||
| 9587 | if (ZeroMask.isZero() && isPowerOf2_32(NumElems) && Subtarget.hasAVX() && | |||
| 9588 | (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) { | |||
| 9589 | for (unsigned SubElems = 1; SubElems < NumElems; SubElems *= 2) { | |||
| 9590 | unsigned RepeatSize = SubElems * BaseSizeInBits; | |||
| 9591 | unsigned ScalarSize = std::min(RepeatSize, 64u); | |||
| 9592 | if (!Subtarget.hasAVX2() && ScalarSize < 32) | |||
| 9593 | continue; | |||
| 9594 | ||||
| 9595 | // Don't attempt a 1:N subvector broadcast - it should be caught by | |||
| 9596 | // combineConcatVectorOps, otherwise it will cause infinite loops. | |||
| 9597 | if (RepeatSize > ScalarSize && SubElems == 1) | |||
| 9598 | continue; | |||
| 9599 | ||||
| 9600 | bool Match = true; | |||
| 9601 | SmallVector<SDValue, 8> RepeatedLoads(SubElems, DAG.getUNDEF(EltBaseVT)); | |||
| 9602 | for (unsigned i = 0; i != NumElems && Match; ++i) { | |||
| 9603 | if (!LoadMask[i]) | |||
| 9604 | continue; | |||
| 9605 | SDValue Elt = peekThroughBitcasts(Elts[i]); | |||
| 9606 | if (RepeatedLoads[i % SubElems].isUndef()) | |||
| 9607 | RepeatedLoads[i % SubElems] = Elt; | |||
| 9608 | else | |||
| 9609 | Match &= (RepeatedLoads[i % SubElems] == Elt); | |||
| 9610 | } | |||
| 9611 | ||||
| 9612 | // We must have loads at both ends of the repetition. | |||
| 9613 | Match &= !RepeatedLoads.front().isUndef(); | |||
| 9614 | Match &= !RepeatedLoads.back().isUndef(); | |||
| 9615 | if (!Match) | |||
| 9616 | continue; | |||
| 9617 | ||||
| 9618 | EVT RepeatVT = | |||
| 9619 | VT.isInteger() && (RepeatSize != 64 || TLI.isTypeLegal(MVT::i64)) | |||
| 9620 | ? EVT::getIntegerVT(*DAG.getContext(), ScalarSize) | |||
| 9621 | : EVT::getFloatingPointVT(ScalarSize); | |||
| 9622 | if (RepeatSize > ScalarSize) | |||
| 9623 | RepeatVT = EVT::getVectorVT(*DAG.getContext(), RepeatVT, | |||
| 9624 | RepeatSize / ScalarSize); | |||
| 9625 | EVT BroadcastVT = | |||
| 9626 | EVT::getVectorVT(*DAG.getContext(), RepeatVT.getScalarType(), | |||
| 9627 | VT.getSizeInBits() / ScalarSize); | |||
| 9628 | if (TLI.isTypeLegal(BroadcastVT)) { | |||
| 9629 | if (SDValue RepeatLoad = EltsFromConsecutiveLoads( | |||
| 9630 | RepeatVT, RepeatedLoads, DL, DAG, Subtarget, IsAfterLegalize)) { | |||
| 9631 | SDValue Broadcast = RepeatLoad; | |||
| 9632 | if (RepeatSize > ScalarSize) { | |||
| 9633 | while (Broadcast.getValueSizeInBits() < VT.getSizeInBits()) | |||
| 9634 | Broadcast = concatSubVectors(Broadcast, Broadcast, DAG, DL); | |||
| 9635 | } else { | |||
| 9636 | if (!Subtarget.hasAVX2() && | |||
| 9637 | !X86::mayFoldLoadIntoBroadcastFromMem( | |||
| 9638 | RepeatLoad, RepeatVT.getScalarType().getSimpleVT(), | |||
| 9639 | Subtarget, | |||
| 9640 | /*AssumeSingleUse=*/true)) | |||
| 9641 | return SDValue(); | |||
| 9642 | Broadcast = | |||
| 9643 | DAG.getNode(X86ISD::VBROADCAST, DL, BroadcastVT, RepeatLoad); | |||
| 9644 | } | |||
| 9645 | return DAG.getBitcast(VT, Broadcast); | |||
| 9646 | } | |||
| 9647 | } | |||
| 9648 | } | |||
| 9649 | } | |||
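| | // Worked example (illustrative, assuming AVX2): Elts = <load P, load P+4, | |||
| | // load P, load P+4> : v4i32 repeats every two elements, so SubElems = 2 | |||
| | // gives RepeatSize = ScalarSize = 64; a single i64 load of P is then | |||
| | // broadcast with X86ISD::VBROADCAST to v2i64 and bitcast back to v4i32. | |||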
| 9650 | ||||
| 9651 | return SDValue(); | |||
| 9652 | } | |||
| 9653 | ||||
| 9654 | // Combine vector ops (shuffles etc.) that are equal to build_vector load1, | |||
| 9655 | // load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses | |||
| 9656 | // are consecutive, non-overlapping, and in the right order. | |||
| 9657 | static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL, | |||
| 9658 | SelectionDAG &DAG, | |||
| 9659 | const X86Subtarget &Subtarget, | |||
| 9660 | bool IsAfterLegalize) { | |||
| 9661 | SmallVector<SDValue, 64> Elts; | |||
| 9662 | for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { | |||
| 9663 | if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) { | |||
| 9664 | Elts.push_back(Elt); | |||
| 9665 | continue; | |||
| 9666 | } | |||
| 9667 | return SDValue(); | |||
| 9668 | } | |||
| 9669 | assert(Elts.size() == VT.getVectorNumElements()); | |||
| 9670 | return EltsFromConsecutiveLoads(VT, Elts, DL, DAG, Subtarget, | |||
| 9671 | IsAfterLegalize); | |||
| 9672 | } | |||

static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
                                   unsigned SplatBitSize, LLVMContext &C) {
  unsigned ScalarSize = VT.getScalarSizeInBits();
  unsigned NumElm = SplatBitSize / ScalarSize;

  SmallVector<Constant *, 32> ConstantVec;
  for (unsigned i = 0; i < NumElm; i++) {
    APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
    Constant *Const;
    if (VT.isFloatingPoint()) {
      if (ScalarSize == 16) {
        Const = ConstantFP::get(C, APFloat(APFloat::IEEEhalf(), Val));
      } else if (ScalarSize == 32) {
        Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
      } else {
        assert(ScalarSize == 64 && "Unsupported floating point scalar size");
        Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
      }
    } else
      Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
    ConstantVec.push_back(Const);
  }
  return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
}

static bool isFoldableUseOfShuffle(SDNode *N) {
  for (auto *U : N->uses()) {
    unsigned Opc = U->getOpcode();
    // VPERMV/VPERMV3 shuffles can never fold their index operands.
    if (Opc == X86ISD::VPERMV && U->getOperand(0).getNode() == N)
      return false;
    if (Opc == X86ISD::VPERMV3 && U->getOperand(1).getNode() == N)
      return false;
    if (isTargetShuffle(Opc))
      return true;
    if (Opc == ISD::BITCAST) // Ignore bitcasts
      return isFoldableUseOfShuffle(U);
    if (N->hasOneUse()) {
      // TODO: There may be some general way to know whether an SDNode can
      // be folded; for now we only know whether an MI is foldable.
      if (Opc == X86ISD::VPDPBUSD && U->getOperand(2).getNode() != N)
        return false;
      return true;
    }
  }
  return false;
}

/// Attempt to use the vbroadcast instruction to generate a splat value
/// from a splat BUILD_VECTOR which uses:
/// a. A single scalar load, or a constant.
/// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
///
/// The VBROADCAST node is returned when a pattern is found,
/// or SDValue() otherwise.
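///
/// For example (an illustrative sketch only; types and addresses are
/// hypothetical):
///   (v8f32 build_vector (f32 load %p), (f32 load %p), ..., (f32 load %p))
/// may become:
///   (v8f32 X86ISD::VBROADCAST_LOAD %p)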
static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {
  // VBROADCAST requires AVX.
  // TODO: Splats could be generated for non-AVX CPUs using SSE
  // instructions, but there's less potential gain for only 128-bit vectors.
  if (!Subtarget.hasAVX())
    return SDValue();

  MVT VT = BVOp->getSimpleValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  SDLoc dl(BVOp);

  assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
         "Unsupported vector type for broadcast.");

  // See if the build vector is a repeating sequence of scalars (inc. splat).
  SDValue Ld;
  BitVector UndefElements;
  SmallVector<SDValue, 16> Sequence;
  if (BVOp->getRepeatedSequence(Sequence, &UndefElements)) {
    assert((NumElts % Sequence.size()) == 0 && "Sequence doesn't fit.");
    if (Sequence.size() == 1)
      Ld = Sequence[0];
  }

  // Attempt to use VBROADCASTM
  // From this pattern:
  // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
  // b. t1 = (build_vector t0 t0)
  //
  // Create (VBROADCASTM v2i1 X)
  if (!Sequence.empty() && Subtarget.hasCDI()) {
    // If not a splat, are the upper sequence values zeroable?
    unsigned SeqLen = Sequence.size();
    bool UpperZeroOrUndef =
        SeqLen == 1 ||
        llvm::all_of(ArrayRef(Sequence).drop_front(), [](SDValue V) {
          return !V || V.isUndef() || isNullConstant(V);
        });
    SDValue Op0 = Sequence[0];
    if (UpperZeroOrUndef && ((Op0.getOpcode() == ISD::BITCAST) ||
                             (Op0.getOpcode() == ISD::ZERO_EXTEND &&
                              Op0.getOperand(0).getOpcode() == ISD::BITCAST))) {
      SDValue BOperand = Op0.getOpcode() == ISD::BITCAST
                             ? Op0.getOperand(0)
                             : Op0.getOperand(0).getOperand(0);
      MVT MaskVT = BOperand.getSimpleValueType();
      MVT EltType = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
      if ((EltType == MVT::i64 && MaskVT == MVT::v8i1) ||  // for broadcastmb2q
          (EltType == MVT::i32 && MaskVT == MVT::v16i1)) { // for broadcastmw2d
        MVT BcstVT = MVT::getVectorVT(EltType, NumElts / SeqLen);
        if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
          unsigned Scale = 512 / VT.getSizeInBits();
          BcstVT = MVT::getVectorVT(EltType, Scale * (NumElts / SeqLen));
        }
        SDValue Bcst = DAG.getNode(X86ISD::VBROADCASTM, dl, BcstVT, BOperand);
        if (BcstVT.getSizeInBits() != VT.getSizeInBits())
          Bcst = extractSubVector(Bcst, 0, DAG, dl, VT.getSizeInBits());
        return DAG.getBitcast(VT, Bcst);
      }
    }
  }

  unsigned NumUndefElts = UndefElements.count();
  if (!Ld || (NumElts - NumUndefElts) <= 1) {
    APInt SplatValue, Undef;
    unsigned SplatBitSize;
    bool HasUndef;
    // Check if this is a repeated constant pattern suitable for broadcasting.
    if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
        SplatBitSize > VT.getScalarSizeInBits() &&
        SplatBitSize < VT.getSizeInBits()) {
      // Avoid replacing with a broadcast when the build_vector is used by a
      // shuffle instruction, to preserve the present custom lowering of
      // shuffles.
      if (isFoldableUseOfShuffle(BVOp))
        return SDValue();
      // Replace BUILD_VECTOR with a broadcast of the repeated constants.
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      LLVMContext *Ctx = DAG.getContext();
      MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
      if (Subtarget.hasAVX()) {
        if (SplatBitSize == 32 || SplatBitSize == 64 ||
            (SplatBitSize < 32 && Subtarget.hasAVX2())) {
          // The splatted value can fit in one INTEGER constant in the
          // constant pool. Load the constant and broadcast it.
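          // E.g. (an illustrative sketch with hypothetical values): a v8i16
          // splat of the repeated pair <42, 7> has SplatBitSize == 32, so we
          // pool the single i32 constant 0x0007002A (element 0 in the low
          // half) and emit a 32-bit VBROADCAST_LOAD of it.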
          MVT CVT = MVT::getIntegerVT(SplatBitSize);
          Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
          Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
          SDValue CP = DAG.getConstantPool(C, PVT);
          unsigned Repeat = VT.getSizeInBits() / SplatBitSize;

          Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
          SDVTList Tys =
              DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
          SDValue Ops[] = {DAG.getEntryNode(), CP};
          MachinePointerInfo MPI =
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
          SDValue Brdcst = DAG.getMemIntrinsicNode(
              X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
              MachineMemOperand::MOLoad);
          return DAG.getBitcast(VT, Brdcst);
        }
        if (SplatBitSize > 64) {
          // Load the vector of constants and broadcast it.
          Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
                                             *Ctx);
          SDValue VCP = DAG.getConstantPool(VecC, PVT);
          unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
          MVT VVT = MVT::getVectorVT(VT.getScalarType(), NumElm);
          Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
          SDVTList Tys = DAG.getVTList(VT, MVT::Other);
          SDValue Ops[] = {DAG.getEntryNode(), VCP};
          MachinePointerInfo MPI =
              MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
          return DAG.getMemIntrinsicNode(
              X86ISD::SUBV_BROADCAST_LOAD, dl, Tys, Ops, VVT, MPI, Alignment,
              MachineMemOperand::MOLoad);
        }
      }
    }

    // If we are moving a scalar into a vector (Ld must be set and all elements
    // but 1 are undef) and that operation is not obviously supported by
    // vmovd/vmovq/vmovss/vmovsd, then keep trying to form a broadcast.
    // That's better than general shuffling and may eliminate a load to GPR and
    // move from scalar to vector register.
    if (!Ld || NumElts - NumUndefElts != 1)
      return SDValue();
    unsigned ScalarSize = Ld.getValueSizeInBits();
    if (!(UndefElements[0] || (ScalarSize != 32 && ScalarSize != 64)))
      return SDValue();
  }

  bool ConstSplatVal =
      (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
  bool IsLoad = ISD::isNormalLoad(Ld.getNode());

  // TODO: Handle broadcasts of non-constant sequences.

  // Make sure that all of the users of a non-constant load are from the
  // BUILD_VECTOR node.
  // FIXME: Is the use count needed for non-constant, non-load case?
  if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
    return SDValue();

  unsigned ScalarSize = Ld.getValueSizeInBits();
  bool IsGE256 = (VT.getSizeInBits() >= 256);

  // When optimizing for size, generate up to 5 extra bytes for a broadcast
  // instruction to save 8 or more bytes of constant pool data.
  // TODO: If multiple splats are generated to load the same constant,
  // it may be detrimental to overall size. There needs to be a way to detect
  // that condition to know if this is truly a size win.
  bool OptForSize = DAG.shouldOptForSize();

  // Handle broadcasting a single constant scalar from the constant pool
  // into a vector.
  // On Sandybridge (no AVX2), it is still better to load a constant vector
  // from the constant pool and not to broadcast it from a scalar.
  // But override that restriction when optimizing for size.
  // TODO: Check if splatting is recommended for other AVX-capable CPUs.
  if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
    EVT CVT = Ld.getValueType();
    assert(!CVT.isVector() && "Must not broadcast a vector type");

    // Splat f16, f32, i32, v4f64, v4i64 in all cases with AVX2.
    // For size optimization, also splat v2f64 and v2i64, and for size opt
    // with AVX2, also splat i8 and i16.
    // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
    if (ScalarSize == 32 ||
        (ScalarSize == 64 && (IsGE256 || Subtarget.hasVLX())) ||
        CVT == MVT::f16 ||
        (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
      const Constant *C = nullptr;
      if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
        C = CI->getConstantIntValue();
      else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
        C = CF->getConstantFPValue();

      assert(C && "Invalid constant type");

      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      SDValue CP =
          DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
      Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();

      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
      SDValue Ops[] = {DAG.getEntryNode(), CP};
      MachinePointerInfo MPI =
          MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
      return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
                                     MPI, Alignment, MachineMemOperand::MOLoad);
    }
  }

  // Handle AVX2 in-register broadcasts.
  if (!IsLoad && Subtarget.hasInt256() &&
      (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // The scalar source must be a normal load.
  if (!IsLoad)
    return SDValue();

  // Make sure the non-chain result is only used by this build vector.
  if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
    return SDValue();

  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
      (Subtarget.hasVLX() && ScalarSize == 64)) {
    auto *LN = cast<LoadSDNode>(Ld);
    SDVTList Tys = DAG.getVTList(VT, MVT::Other);
    SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
    SDValue BCast =
        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
                                LN->getMemoryVT(), LN->getMemOperand());
    DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
    return BCast;
  }

  // The integer check is needed for the 64-bit into 128-bit case, so that we
  // don't match f64: there is no 'vbroadcastsd' with an xmm destination.
  if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
      (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
    auto *LN = cast<LoadSDNode>(Ld);
    SDVTList Tys = DAG.getVTList(VT, MVT::Other);
    SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
    SDValue BCast =
        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
                                LN->getMemoryVT(), LN->getMemOperand());
    DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
    return BCast;
  }

  if (ScalarSize == 16 && Subtarget.hasFP16() && IsGE256)
    return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);

  // Unsupported broadcast.
  return SDValue();
}

/// For an EXTRACT_VECTOR_ELT with a constant index return the real
/// underlying vector and index.
///
/// Modifies \p ExtractedFromVec to the real vector and returns the real
/// index.
static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
                                         SDValue ExtIdx) {
  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
    return Idx;

  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
  // lowered this:
  //   (extract_vector_elt (v8f32 %1), Constant<6>)
  // to:
  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
  //                            (extract_subvector (v8f32 %0), Constant<4>),
  //                            undef)
  //                       Constant<0>)
  // In this case the vector is the extract_subvector expression and the index
  // is 2, as specified by the shuffle.
  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
  SDValue ShuffleVec = SVOp->getOperand(0);
  MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
  assert(ShuffleVecVT.getVectorElementType() ==
         ExtractedFromVec.getSimpleValueType().getVectorElementType());

  int ShuffleIdx = SVOp->getMaskElt(Idx);
  if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
    ExtractedFromVec = ShuffleVec;
    return ShuffleIdx;
  }
  return Idx;
}

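// Attempt to lower a BUILD_VECTOR whose operands are mostly
// EXTRACT_VECTOR_ELTs of at most two source vectors as a shuffle; the few
// remaining operands are re-inserted with INSERT_VECTOR_ELT afterwards.
// E.g. (an illustrative sketch; %x is a hypothetical scalar):
//   (build_vector (extract A, 0), %x, (extract A, 2), (extract A, 3))
// --> (insert (shuffle A, undef, <0,u,2,3>), %x, 1)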
static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  // Skip if insert_vec_elt is not supported.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
    return SDValue();

  SDLoc DL(Op);
  unsigned NumElems = Op.getNumOperands();

  SDValue VecIn1;
  SDValue VecIn2;
  SmallVector<unsigned, 4> InsertIndices;
  SmallVector<int, 8> Mask(NumElems, -1);

  for (unsigned i = 0; i != NumElems; ++i) {
    unsigned Opc = Op.getOperand(i).getOpcode();

    if (Opc == ISD::UNDEF)
      continue;

    if (Opc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit if more than 1 element needs inserting.
      if (InsertIndices.size() > 1)
        return SDValue();

      InsertIndices.push_back(i);
      continue;
    }

    SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
    SDValue ExtIdx = Op.getOperand(i).getOperand(1);

    // Quit if non-constant index.
    if (!isa<ConstantSDNode>(ExtIdx))
      return SDValue();
    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);

    // Quit if extracted from a vector of a different type.
    if (ExtractedFromVec.getValueType() != VT)
      return SDValue();

    if (!VecIn1.getNode())
      VecIn1 = ExtractedFromVec;
    else if (VecIn1 != ExtractedFromVec) {
      if (!VecIn2.getNode())
        VecIn2 = ExtractedFromVec;
      else if (VecIn2 != ExtractedFromVec)
        // Quit if more than 2 vectors to shuffle.
        return SDValue();
    }

    if (ExtractedFromVec == VecIn1)
      Mask[i] = Idx;
    else if (ExtractedFromVec == VecIn2)
      Mask[i] = Idx + NumElems;
  }

  if (!VecIn1.getNode())
    return SDValue();

  VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
  SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);

  for (unsigned Idx : InsertIndices)
    NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
                     DAG.getIntPtrConstant(Idx, DL));

  return NV;
}

// Lower BUILD_VECTOR operation for v8bf16, v16bf16 and v32bf16 types.
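// The elements are bitcast to i16, built as the equivalent integer vector and
// bitcast back, e.g. (an illustrative sketch):
//   (v8bf16 build_vector %a, %b, ...)
// --> (bitcast (v8i16 build_vector (bitcast %a), (bitcast %b), ...))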
static SDValue LowerBUILD_VECTORvXbf16(SDValue Op, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
  MVT VT = Op.getSimpleValueType();
  MVT IVT = VT.changeVectorElementTypeToInteger();
  SmallVector<SDValue, 16> NewOps;
  for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I)
    NewOps.push_back(DAG.getBitcast(MVT::i16, Op.getOperand(I)));
  SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(), IVT, NewOps);
  return DAG.getBitcast(VT, Res);
}

// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
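// E.g. (an illustrative sketch): a constant v8i1 <1,0,1,1,0,0,0,1> packs its
// bits into the i8 immediate 0b10001101 (element 0 in bit 0), which is then
// bitcast to the v8i1 result.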
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {

  MVT VT = Op.getSimpleValueType();
  assert((VT.getVectorElementType() == MVT::i1) &&
         "Unexpected type in LowerBUILD_VECTORvXi1!");

  SDLoc dl(Op);
  if (ISD::isBuildVectorAllZeros(Op.getNode()) ||
      ISD::isBuildVectorAllOnes(Op.getNode()))
    return Op;

  uint64_t Immediate = 0;
  SmallVector<unsigned, 16> NonConstIdx;
  bool IsSplat = true;
  bool HasConstElts = false;
  int SplatIdx = -1;
  for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
    SDValue In = Op.getOperand(idx);
    if (In.isUndef())
      continue;
    if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
      Immediate |= (InC->getZExtValue() & 0x1) << idx;
      HasConstElts = true;
    } else {
      NonConstIdx.push_back(idx);
    }
    if (SplatIdx < 0)
      SplatIdx = idx;
    else if (In != Op.getOperand(SplatIdx))
      IsSplat = false;
  }

  // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
  if (IsSplat) {
    // The build_vector allows the scalar element to be larger than the vector
    // element type. We need to mask it to use as a condition unless we know
    // the upper bits are zero.
    // FIXME: Use computeKnownBits instead of checking specific opcode?
    SDValue Cond = Op.getOperand(SplatIdx);
    assert(Cond.getValueType() == MVT::i8 && "Unexpected VT!");
    if (Cond.getOpcode() != ISD::SETCC)
      Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
                         DAG.getConstant(1, dl, MVT::i8));

    // Perform the select in the scalar domain so we can use cmov.
    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
      SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
                                     DAG.getAllOnesConstant(dl, MVT::i32),
                                     DAG.getConstant(0, dl, MVT::i32));
      Select = DAG.getBitcast(MVT::v32i1, Select);
      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
    } else {
      MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
      SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
                                     DAG.getAllOnesConstant(dl, ImmVT),
                                     DAG.getConstant(0, dl, ImmVT));
      MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
      Select = DAG.getBitcast(VecVT, Select);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
                         DAG.getIntPtrConstant(0, dl));
    }
  }

  // Insert the non-constant elements one by one.
  SDValue DstVec;
  if (HasConstElts) {
    if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
      SDValue ImmL = DAG.getConstant(Lo_32(Immediate), dl, MVT::i32);
      SDValue ImmH = DAG.getConstant(Hi_32(Immediate), dl, MVT::i32);
      ImmL = DAG.getBitcast(MVT::v32i1, ImmL);
      ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
      DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
    } else {
      MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
      SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
      MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
      DstVec = DAG.getBitcast(VecVT, Imm);
      DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, DstVec,
                           DAG.getIntPtrConstant(0, dl));
    }
  } else
    DstVec = DAG.getUNDEF(VT);

  for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
    unsigned InsertIdx = NonConstIdx[i];
    DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
                         Op.getOperand(InsertIdx),
                         DAG.getIntPtrConstant(InsertIdx, dl));
  }
  return DstVec;
}

LLVM_ATTRIBUTE_UNUSED static bool isHorizOp(unsigned Opcode) {
  switch (Opcode) {
  case X86ISD::PACKSS:
  case X86ISD::PACKUS:
  case X86ISD::FHADD:
  case X86ISD::FHSUB:
  case X86ISD::HADD:
  case X86ISD::HSUB:
    return true;
  }
  return false;
}

/// This is a helper function of LowerToHorizontalOp().
/// This function checks whether the input build_vector \p N implements a
/// 128-bit partial horizontal operation on a 256-bit vector, but that
/// operation may not match the layout of an x86 256-bit horizontal
/// instruction. In other words, if this returns true, then some
/// extraction/insertion will be required to produce a valid horizontal
/// instruction.
///
/// Parameter \p Opcode defines the kind of horizontal operation to match.
/// For example, if \p Opcode is equal to ISD::ADD, then this function
/// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
/// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
///
/// This function only analyzes elements of \p N whose indices are
/// in range [BaseIdx, LastIdx).
///
/// TODO: This function was originally used to match both real and fake partial
/// horizontal operations, but the index-matching logic is incorrect for that.
/// See the corrected implementation in isHopBuildVector(). Can we reduce this
/// code because it is only used for partial h-op matching now?
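///
/// For example (an illustrative sketch), with \p Opcode == ISD::ADD,
/// \p BaseIdx == 0 and \p LastIdx == 4, the element sequence:
///   (add (extract A, 0), (extract A, 1)), (add (extract A, 2), (extract A, 3)),
///   (add (extract B, 0), (extract B, 1)), (add (extract B, 2), (extract B, 3))
/// matches, returning A in \p V0 and B in \p V1.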
static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
                                  SelectionDAG &DAG,
                                  unsigned BaseIdx, unsigned LastIdx,
                                  SDValue &V0, SDValue &V1) {
  EVT VT = N->getValueType(0);
  assert(VT.is256BitVector() && "Only use for matching partial 256-bit h-ops");
  assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
  assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
         "Invalid Vector in input!");

  bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
  bool CanFold = true;
  unsigned ExpectedVExtractIdx = BaseIdx;
  unsigned NumElts = LastIdx - BaseIdx;
  V0 = DAG.getUNDEF(VT);
  V1 = DAG.getUNDEF(VT);

  // Check if N implements a horizontal binop.
  for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
    SDValue Op = N->getOperand(i + BaseIdx);

    // Skip UNDEFs.
    if (Op->isUndef()) {
      // Update the expected vector extract index.
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
      ExpectedVExtractIdx += 2;
      continue;
    }

    CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();

    if (!CanFold)
      break;

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
    CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
               Op0.getOperand(0) == Op1.getOperand(0) &&
               isa<ConstantSDNode>(Op0.getOperand(1)) &&
               isa<ConstantSDNode>(Op1.getOperand(1)));
    if (!CanFold)
      break;

    unsigned I0 = Op0.getConstantOperandVal(1);
    unsigned I1 = Op1.getConstantOperandVal(1);

    if (i * 2 < NumElts) {
      if (V0.isUndef()) {
        V0 = Op0.getOperand(0);
        if (V0.getValueType() != VT)
          return false;
      }
    } else {
      if (V1.isUndef()) {
        V1 = Op0.getOperand(0);
        if (V1.getValueType() != VT)
          return false;
      }
      if (i * 2 == NumElts)
        ExpectedVExtractIdx = BaseIdx;
    }

    SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
    if (I0 == ExpectedVExtractIdx)
      CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
    else if (IsCommutable && I1 == ExpectedVExtractIdx) {
      // Try to match the following dag sequence:
      // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
      CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
    } else
      CanFold = false;

    ExpectedVExtractIdx += 2;
  }

  return CanFold;
}

/// Emit a sequence of two 128-bit horizontal add/sub followed by
/// a concat_vector.
///
/// This is a helper function of LowerToHorizontalOp().
/// This function expects two 256-bit vectors called V0 and V1.
/// At first, each vector is split into two separate 128-bit vectors.
/// Then, the resulting 128-bit vectors are used to implement two
/// horizontal binary operations.
///
/// The kind of horizontal binary operation is defined by \p X86Opcode.
///
/// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as inputs
/// to the two new horizontal binops.
/// When Mode is set, the first horizontal binop dag node takes as inputs the
/// lower and the upper 128 bits of V0, and the second horizontal binop dag
/// node takes as inputs the lower and the upper 128 bits of V1.
/// Example:
///   HADD V0_LO, V0_HI
///   HADD V1_LO, V1_HI
///
/// Otherwise, the first horizontal binop dag node takes as inputs the lower
/// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal
/// binop dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
/// Example:
///   HADD V0_LO, V1_LO
///   HADD V0_HI, V1_HI
///
/// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
/// 128 bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
/// the upper 128 bits of the result.
static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
                                     const SDLoc &DL, SelectionDAG &DAG,
                                     unsigned X86Opcode, bool Mode,
                                     bool isUndefLO, bool isUndefHI) {
  MVT VT = V0.getSimpleValueType();
  assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
         "Invalid nodes in input!");

  unsigned NumElts = VT.getVectorNumElements();
  SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
  SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
  SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
  SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
  MVT NewVT = V0_LO.getSimpleValueType();

  SDValue LO = DAG.getUNDEF(NewVT);
  SDValue HI = DAG.getUNDEF(NewVT);

  if (Mode) {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && !V0->isUndef())
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
    if (!isUndefHI && !V1->isUndef())
      HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
  } else {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);

    if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
      HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
  }

  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
}

/// Returns true iff \p BV builds a vector with the result equivalent to
/// the result of an ADDSUB/SUBADD operation.
/// If true is returned then the operands of the ADDSUB = Opnd0 +- Opnd1
/// (SUBADD = Opnd0 -+ Opnd1) operation are written to the parameters
/// \p Opnd0 and \p Opnd1.
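///
/// For example (an illustrative sketch):
///   (build_vector (fsub (extract A, 0), (extract B, 0)),
///                 (fadd (extract A, 1), (extract B, 1)),
///                 (fsub (extract A, 2), (extract B, 2)),
///                 (fadd (extract A, 3), (extract B, 3)))
/// matches an ADDSUB with \p Opnd0 = A, \p Opnd1 = B and IsSubAdd == false.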
static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
                             const X86Subtarget &Subtarget, SelectionDAG &DAG,
                             SDValue &Opnd0, SDValue &Opnd1,
                             unsigned &NumExtracts,
                             bool &IsSubAdd) {

  MVT VT = BV->getSimpleValueType(0);
  if (!Subtarget.hasSSE3() || !VT.isFloatingPoint())
    return false;

  unsigned NumElts = VT.getVectorNumElements();
  SDValue InVec0 = DAG.getUNDEF(VT);
  SDValue InVec1 = DAG.getUNDEF(VT);

  NumExtracts = 0;

  // Odd-numbered elements in the input build vector are obtained from
  // adding/subtracting two integer/float elements.
  // Even-numbered elements in the input build vector are obtained from
  // subtracting/adding two integer/float elements.
  unsigned Opc[2] = {0, 0};
  for (unsigned i = 0, e = NumElts; i != e; ++i) {
    SDValue Op = BV->getOperand(i);

    // Skip 'undef' values.
    unsigned Opcode = Op.getOpcode();
    if (Opcode == ISD::UNDEF)
      continue;

    // Early exit if we found an unexpected opcode.
    if (Opcode != ISD::FADD && Opcode != ISD::FSUB)
      return false;

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
    // Early exit if we cannot match that sequence.
    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
        Op0.getOperand(1) != Op1.getOperand(1))
      return false;

    unsigned I0 = Op0.getConstantOperandVal(1);
    if (I0 != i)
      return false;

    // We found a valid add/sub node; make sure it's the same opcode as
    // previous elements for this parity.
    if (Opc[i % 2] != 0 && Opc[i % 2] != Opcode)
      return false;
    Opc[i % 2] = Opcode;

    // Update InVec0 and InVec1.
    if (InVec0.isUndef()) {
      InVec0 = Op0.getOperand(0);
      if (InVec0.getSimpleValueType() != VT)
        return false;
    }
    if (InVec1.isUndef()) {
      InVec1 = Op1.getOperand(0);
      if (InVec1.getSimpleValueType() != VT)
        return false;
    }

    // Make sure that the operands of each add/sub node always
    // come from the same pair of vectors.
    if (InVec0 != Op0.getOperand(0)) {
      if (Opcode == ISD::FSUB)
        return false;

      // FADD is commutable. Try to commute the operands
      // and then test again.
      std::swap(Op0, Op1);
      if (InVec0 != Op0.getOperand(0))
        return false;
    }

    if (InVec1 != Op1.getOperand(0))
      return false;

    // Increment the number of extractions done.
    ++NumExtracts;
  }

  // Ensure we have found an opcode for both parities and that they are
  // different. Don't try to fold this build_vector into an ADDSUB/SUBADD if
  // the inputs are undef.
  if (!Opc[0] || !Opc[1] || Opc[0] == Opc[1] ||
      InVec0.isUndef() || InVec1.isUndef())
    return false;

  IsSubAdd = Opc[0] == ISD::FADD;

  Opnd0 = InVec0;
  Opnd1 = InVec1;
  return true;
}

/// Returns true if it is possible to fold MUL and an idiom that has already
/// been recognized as ADDSUB/SUBADD(\p Opnd0, \p Opnd1) into
/// FMADDSUB/FMSUBADD(x, y, \p Opnd1). If (and only if) true is returned, the
/// operands of FMADDSUB/FMSUBADD are written to parameters \p Opnd0, \p Opnd1,
/// \p Opnd2.
///
/// Prior to calling this function it should be known that there is some
/// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
/// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
/// before replacement of such SDNode with ADDSUB operation. Thus the number
/// of \p Opnd0 uses is expected to be equal to 2.
/// For example, this function may be called for the following IR:
///   %AB = fmul fast <2 x double> %A, %B
///   %Sub = fsub fast <2 x double> %AB, %C
///   %Add = fadd fast <2 x double> %AB, %C
///   %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
///                           <2 x i32> <i32 0, i32 3>
/// There is a def for %Addsub here, which potentially can be replaced by
/// X86ISD::ADDSUB operation:
///   %Addsub = X86ISD::ADDSUB %AB, %C
/// and such ADDSUB can further be replaced with FMADDSUB:
///   %Addsub = FMADDSUB %A, %B, %C.
///
/// The main reason why this method is called before the replacement of the
/// recognized ADDSUB idiom with ADDSUB operation is that such replacement
/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
/// FMADDSUB is.
static bool isFMAddSubOrFMSubAdd(const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG,
                                 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
                                 unsigned ExpectedUses) {
  if (Opnd0.getOpcode() != ISD::FMUL ||
      !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA())
    return false;

  // FIXME: These checks must match the similar ones in
  // DAGCombiner::visitFADDForFMACombine. It would be good to have one
  // function that would answer if it is Ok to fuse MUL + ADD to FMADD
  // or MUL + ADDSUB to FMADDSUB.
  const TargetOptions &Options = DAG.getTarget().Options;
  bool AllowFusion =
      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
  if (!AllowFusion)
    return false;

  Opnd2 = Opnd1;
  Opnd1 = Opnd0.getOperand(1);
  Opnd0 = Opnd0.getOperand(0);

  return true;
}

/// Try to fold a build_vector that performs an 'addsub', 'fmaddsub' or
/// 'fmsubadd' operation into an X86ISD::ADDSUB, X86ISD::FMADDSUB or
/// X86ISD::FMSUBADD node.
static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  SDValue Opnd0, Opnd1;
  unsigned NumExtracts;
  bool IsSubAdd;
  if (!isAddSubOrSubAdd(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts,
                        IsSubAdd))
    return SDValue();

  MVT VT = BV->getSimpleValueType(0);
  SDLoc DL(BV);

  // Try to generate an X86ISD::FMADDSUB node here.
  SDValue Opnd2;
  if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) {
    unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB;
    return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2);
  }

  // We only support ADDSUB.
  if (IsSubAdd)
    return SDValue();

  // There are no known X86 targets with 512-bit ADDSUB instructions!
  // Convert to blend(fsub,fadd).
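  // E.g. (an illustrative sketch) for v8f64 the mask built below is
  // <0, 9, 2, 11, 4, 13, 6, 15>: even result lanes come from the FSUB node
  // and odd result lanes from the FADD node.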
  if (VT.is512BitVector()) {
    SmallVector<int> Mask;
    for (int I = 0, E = VT.getVectorNumElements(); I != E; I += 2) {
      Mask.push_back(I);
      Mask.push_back(I + E + 1);
    }
    SDValue Sub = DAG.getNode(ISD::FSUB, DL, VT, Opnd0, Opnd1);
    SDValue Add = DAG.getNode(ISD::FADD, DL, VT, Opnd0, Opnd1);
    return DAG.getVectorShuffle(VT, DL, Sub, Add, Mask);
  }

  return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
}

static bool isHopBuildVector(const BuildVectorSDNode *BV, SelectionDAG &DAG,
                             unsigned &HOpcode, SDValue &V0, SDValue &V1) {
  // Initialize outputs to known values.
  MVT VT = BV->getSimpleValueType(0);
  HOpcode = ISD::DELETED_NODE;
  V0 = DAG.getUNDEF(VT);
  V1 = DAG.getUNDEF(VT);

  // x86 256-bit horizontal ops are defined in a non-obvious way. Each 128-bit
  // half of the result is calculated independently from the 128-bit halves of
  // the inputs, so that makes the index-checking logic below more complicated.
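  // E.g. (an illustrative sketch) for a v8i32 HADD with inputs A and B, the
  // result lanes are:
  //   <A0+A1, A2+A3, B0+B1, B2+B3, A4+A5, A6+A7, B4+B5, B6+B7>
  // i.e. each 128-bit half mixes 64 bits from each input's matching half.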
  unsigned NumElts = VT.getVectorNumElements();
  unsigned GenericOpcode = ISD::DELETED_NODE;
  unsigned Num128BitChunks = VT.is256BitVector() ? 2 : 1;
  unsigned NumEltsIn128Bits = NumElts / Num128BitChunks;
  unsigned NumEltsIn64Bits = NumEltsIn128Bits / 2;
  for (unsigned i = 0; i != Num128BitChunks; ++i) {
    for (unsigned j = 0; j != NumEltsIn128Bits; ++j) {
      // Ignore undef elements.
      SDValue Op = BV->getOperand(i * NumEltsIn128Bits + j);
      if (Op.isUndef())
        continue;

      // If there's an opcode mismatch, we're done.
      if (HOpcode != ISD::DELETED_NODE && Op.getOpcode() != GenericOpcode)
        return false;

      // Initialize the horizontal opcode.
      if (HOpcode == ISD::DELETED_NODE) {
        GenericOpcode = Op.getOpcode();
        switch (GenericOpcode) {
        case ISD::ADD: HOpcode = X86ISD::HADD; break;
        case ISD::SUB: HOpcode = X86ISD::HSUB; break;
        case ISD::FADD: HOpcode = X86ISD::FHADD; break;
        case ISD::FSUB: HOpcode = X86ISD::FHSUB; break;
        default: return false;
        }
      }

      SDValue Op0 = Op.getOperand(0);
      SDValue Op1 = Op.getOperand(1);
      if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
          Op0.getOperand(0) != Op1.getOperand(0) ||
          !isa<ConstantSDNode>(Op0.getOperand(1)) ||
          !isa<ConstantSDNode>(Op1.getOperand(1)) || !Op.hasOneUse())
        return false;

      // The source vector is chosen based on which 64-bit half of the
      // destination vector is being calculated.
      if (j < NumEltsIn64Bits) {
        if (V0.isUndef())
          V0 = Op0.getOperand(0);
      } else {
        if (V1.isUndef())
          V1 = Op0.getOperand(0);
      }

      SDValue SourceVec = (j < NumEltsIn64Bits) ? V0 : V1;
      if (SourceVec != Op0.getOperand(0))
        return false;

      // op (extract_vector_elt A, I), (extract_vector_elt A, I+1)
      unsigned ExtIndex0 = Op0.getConstantOperandVal(1);
      unsigned ExtIndex1 = Op1.getConstantOperandVal(1);
      unsigned ExpectedIndex = i * NumEltsIn128Bits +
                               (j % NumEltsIn64Bits) * 2;
      if (ExpectedIndex == ExtIndex0 && ExtIndex1 == ExtIndex0 + 1)
        continue;

      // If this is not a commutative op, this does not match.
      if (GenericOpcode != ISD::ADD && GenericOpcode != ISD::FADD)
        return false;

      // Addition is commutative, so try swapping the extract indexes.
      // op (extract_vector_elt A, I+1), (extract_vector_elt A, I)
      if (ExpectedIndex == ExtIndex1 && ExtIndex0 == ExtIndex1 + 1)
        continue;

      // The extract indexes do not match the horizontal requirement.
      return false;
    }
  }
  // We matched. Opcode and operands are returned by reference as arguments.
  return true;
}

static SDValue getHopForBuildVector(const BuildVectorSDNode *BV,
                                    SelectionDAG &DAG, unsigned HOpcode,
                                    SDValue V0, SDValue V1) {
  // If either input vector is not the same size as the build vector,
  // extract/insert the low bits to the correct size.
  // This is free (examples: zmm --> xmm, xmm --> ymm).
  MVT VT = BV->getSimpleValueType(0);
  unsigned Width = VT.getSizeInBits();
  if (V0.getValueSizeInBits() > Width)
    V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), Width);
  else if (V0.getValueSizeInBits() < Width)
    V0 = insertSubVector(DAG.getUNDEF(VT), V0, 0, DAG, SDLoc(BV), Width);

  if (V1.getValueSizeInBits() > Width)
    V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), Width);
  else if (V1.getValueSizeInBits() < Width)
    V1 = insertSubVector(DAG.getUNDEF(VT), V1, 0, DAG, SDLoc(BV), Width);

  unsigned NumElts = VT.getVectorNumElements();
  APInt DemandedElts = APInt::getAllOnes(NumElts);
  for (unsigned i = 0; i != NumElts; ++i)
    if (BV->getOperand(i).isUndef())
      DemandedElts.clearBit(i);

  // If we don't need the upper xmm, then perform as a xmm hop.
  unsigned HalfNumElts = NumElts / 2;
  if (VT.is256BitVector() && DemandedElts.lshr(HalfNumElts) == 0) {
    MVT HalfVT = VT.getHalfNumVectorElementsVT();
    V0 = extractSubVector(V0, 0, DAG, SDLoc(BV), 128);
    V1 = extractSubVector(V1, 0, DAG, SDLoc(BV), 128);
    SDValue Half = DAG.getNode(HOpcode, SDLoc(BV), HalfVT, V0, V1);
    return insertSubVector(DAG.getUNDEF(VT), Half, 0, DAG, SDLoc(BV), 256);
  }

  return DAG.getNode(HOpcode, SDLoc(BV), VT, V0, V1);
}
| 10691 | /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible. | |||
| 10692 | static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV, | |||
| 10693 | const X86Subtarget &Subtarget, | |||
| 10694 | SelectionDAG &DAG) { | |||
| 10695 | // We need at least 2 non-undef elements to make this worthwhile by default. | |||
| 10696 | unsigned NumNonUndefs = | |||
| 10697 | count_if(BV->op_values(), [](SDValue V) { return !V.isUndef(); }); | |||
| 10698 | if (NumNonUndefs < 2) | |||
| 10699 | return SDValue(); | |||
| 10700 | ||||
| 10701 | // There are 4 sets of horizontal math operations distinguished by type: | |||
| 10702 | // int/FP at 128-bit/256-bit. Each type was introduced with a different | |||
| 10703 | // subtarget feature. Try to match those "native" patterns first. | |||
| 10704 | MVT VT = BV->getSimpleValueType(0); | |||
| 10705 | if (((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) || | |||
| 10706 | ((VT == MVT::v8i16 || VT == MVT::v4i32) && Subtarget.hasSSSE3()) || | |||
| 10707 | ((VT == MVT::v8f32 || VT == MVT::v4f64) && Subtarget.hasAVX()) || | |||
| 10708 | ((VT == MVT::v16i16 || VT == MVT::v8i32) && Subtarget.hasAVX2())) { | |||
| 10709 | unsigned HOpcode; | |||
| 10710 | SDValue V0, V1; | |||
| 10711 | if (isHopBuildVector(BV, DAG, HOpcode, V0, V1)) | |||
| 10712 | return getHopForBuildVector(BV, DAG, HOpcode, V0, V1); | |||
| 10713 | } | |||
| 10714 | ||||
| 10715 | // Try harder to match 256-bit ops by using extract/concat. | |||
| 10716 | if (!Subtarget.hasAVX() || !VT.is256BitVector()) | |||
| 10717 | return SDValue(); | |||
| 10718 | ||||
| 10719 | // Count the number of UNDEF operands in the input build_vector. | |||
| 10720 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 10721 | unsigned Half = NumElts / 2; | |||
| 10722 | unsigned NumUndefsLO = 0; | |||
| 10723 | unsigned NumUndefsHI = 0; | |||
| 10724 | for (unsigned i = 0, e = Half; i != e; ++i) | |||
| 10725 | if (BV->getOperand(i)->isUndef()) | |||
| 10726 | NumUndefsLO++; | |||
| 10727 | ||||
| 10728 | for (unsigned i = Half, e = NumElts; i != e; ++i) | |||
| 10729 | if (BV->getOperand(i)->isUndef()) | |||
| 10730 | NumUndefsHI++; | |||
| 10731 | ||||
| 10732 | SDLoc DL(BV); | |||
| 10733 | SDValue InVec0, InVec1; | |||
| 10734 | if (VT == MVT::v8i32 || VT == MVT::v16i16) { | |||
| 10735 | SDValue InVec2, InVec3; | |||
| 10736 | unsigned X86Opcode; | |||
| 10737 | bool CanFold = true; | |||
| 10738 | ||||
| 10739 | if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) && | |||
| 10740 | isHorizontalBinOpPart(BV, ISD::ADD, DAG, Half, NumElts, InVec2, | |||
| 10741 | InVec3) && | |||
| 10742 | ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && | |||
| 10743 | ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) | |||
| 10744 | X86Opcode = X86ISD::HADD; | |||
| 10745 | else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, Half, InVec0, | |||
| 10746 | InVec1) && | |||
| 10747 | isHorizontalBinOpPart(BV, ISD::SUB, DAG, Half, NumElts, InVec2, | |||
| 10748 | InVec3) && | |||
| 10749 | ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) && | |||
| 10750 | ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3)) | |||
| 10751 | X86Opcode = X86ISD::HSUB; | |||
| 10752 | else | |||
| 10753 | CanFold = false; | |||
| 10754 | ||||
| 10755 | if (CanFold) { | |||
| 10756 | // Do not try to expand this build_vector into a pair of horizontal | |||
| 10757 | // add/sub if we can emit a pair of scalar add/sub. | |||
| 10758 | if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) | |||
| 10759 | return SDValue(); | |||
| 10760 | ||||
| 10761 | // Convert this build_vector into a pair of horizontal binops followed by | |||
| 10762 | // a concat vector. We must adjust the outputs from the partial horizontal | |||
| 10763 | // matching calls above to account for undefined vector halves. | |||
| 10764 | SDValue V0 = InVec0.isUndef() ? InVec2 : InVec0; | |||
| 10765 | SDValue V1 = InVec1.isUndef() ? InVec3 : InVec1; | |||
| 10766 | assert((!V0.isUndef() || !V1.isUndef()) && "Horizontal-op of undefs?"); | |||
| 10767 | bool isUndefLO = NumUndefsLO == Half; | |||
| 10768 | bool isUndefHI = NumUndefsHI == Half; | |||
| 10769 | return ExpandHorizontalBinOp(V0, V1, DL, DAG, X86Opcode, false, isUndefLO, | |||
| 10770 | isUndefHI); | |||
| 10771 | } | |||
| 10772 | } | |||
| 10773 | ||||
| 10774 | if (VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 || | |||
| 10775 | VT == MVT::v16i16) { | |||
| 10776 | unsigned X86Opcode; | |||
| 10777 | if (isHorizontalBinOpPart(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1)) | |||
| 10778 | X86Opcode = X86ISD::HADD; | |||
| 10779 | else if (isHorizontalBinOpPart(BV, ISD::SUB, DAG, 0, NumElts, InVec0, | |||
| 10780 | InVec1)) | |||
| 10781 | X86Opcode = X86ISD::HSUB; | |||
| 10782 | else if (isHorizontalBinOpPart(BV, ISD::FADD, DAG, 0, NumElts, InVec0, | |||
| 10783 | InVec1)) | |||
| 10784 | X86Opcode = X86ISD::FHADD; | |||
| 10785 | else if (isHorizontalBinOpPart(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, | |||
| 10786 | InVec1)) | |||
| 10787 | X86Opcode = X86ISD::FHSUB; | |||
| 10788 | else | |||
| 10789 | return SDValue(); | |||
| 10790 | ||||
| 10791 | // Don't try to expand this build_vector into a pair of horizontal add/sub | |||
| 10792 | // if we can simply emit a pair of scalar add/sub. | |||
| 10793 | if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half) | |||
| 10794 | return SDValue(); | |||
| 10795 | ||||
| 10796 | // Convert this build_vector into two horizontal add/sub followed by | |||
| 10797 | // a concat vector. | |||
| 10798 | bool isUndefLO = NumUndefsLO == Half; | |||
| 10799 | bool isUndefHI = NumUndefsHI == Half; | |||
| 10800 | return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true, | |||
| 10801 | isUndefLO, isUndefHI); | |||
| 10802 | } | |||
| 10803 | ||||
| 10804 | return SDValue(); | |||
| 10805 | } | |||
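| | // A minimal illustrative match for the "native" path above (SSE3, v4f32): | |||
| | // (build_vector (fadd (extractelt A, 0), (extractelt A, 1)), | |||
| | // (fadd (extractelt A, 2), (extractelt A, 3)), | |||
| | // (fadd (extractelt B, 0), (extractelt B, 1)), | |||
| | // (fadd (extractelt B, 2), (extractelt B, 3))) | |||
| | // is recognized by isHopBuildVector and lowered to (X86ISD::FHADD A, B). | |||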
| 10806 | ||||
| 10807 | static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, | |||
| 10808 | SelectionDAG &DAG); | |||
| 10809 | ||||
| 10810 | /// If a BUILD_VECTOR's source elements all apply the same bit operation and | |||
| 10811 | /// one of their operands is constant, lower to a pair of BUILD_VECTORs and | |||
| 10812 | /// just apply the bit operation to the vectors. | |||
| 10813 | /// NOTE: It's not in our interest to start making a general-purpose | |||
| 10814 | /// vectorizer from this, but enough scalar bit operations are created by the | |||
| 10815 | /// later legalization + scalarization stages to need basic support. | |||
| 10816 | static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op, | |||
| 10817 | const X86Subtarget &Subtarget, | |||
| 10818 | SelectionDAG &DAG) { | |||
| 10819 | SDLoc DL(Op); | |||
| 10820 | MVT VT = Op->getSimpleValueType(0); | |||
| 10821 | unsigned NumElems = VT.getVectorNumElements(); | |||
| 10822 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 10823 | ||||
| 10824 | // Check that all elements have the same opcode. | |||
| 10825 | // TODO: Should we allow UNDEFS and if so how many? | |||
| 10826 | unsigned Opcode = Op->getOperand(0).getOpcode(); | |||
| 10827 | for (unsigned i = 1; i < NumElems; ++i) | |||
| 10828 | if (Opcode != Op->getOperand(i).getOpcode()) | |||
| 10829 | return SDValue(); | |||
| 10830 | ||||
| 10831 | // TODO: We may be able to add support for other Ops (ADD/SUB + shifts). | |||
| 10832 | bool IsShift = false; | |||
| 10833 | switch (Opcode) { | |||
| 10834 | default: | |||
| 10835 | return SDValue(); | |||
| 10836 | case ISD::SHL: | |||
| 10837 | case ISD::SRL: | |||
| 10838 | case ISD::SRA: | |||
| 10839 | IsShift = true; | |||
| 10840 | break; | |||
| 10841 | case ISD::AND: | |||
| 10842 | case ISD::XOR: | |||
| 10843 | case ISD::OR: | |||
| 10844 | // Don't do this if the buildvector is a splat - we'd replace one | |||
| 10845 | // constant with an entire vector. | |||
| 10846 | if (Op->getSplatValue()) | |||
| 10847 | return SDValue(); | |||
| 10848 | if (!TLI.isOperationLegalOrPromote(Opcode, VT)) | |||
| 10849 | return SDValue(); | |||
| 10850 | break; | |||
| 10851 | } | |||
| 10852 | ||||
| 10853 | SmallVector<SDValue, 4> LHSElts, RHSElts; | |||
| 10854 | for (SDValue Elt : Op->ops()) { | |||
| 10855 | SDValue LHS = Elt.getOperand(0); | |||
| 10856 | SDValue RHS = Elt.getOperand(1); | |||
| 10857 | ||||
| 10858 | // We expect the canonicalized RHS operand to be the constant. | |||
| 10859 | if (!isa<ConstantSDNode>(RHS)) | |||
| 10860 | return SDValue(); | |||
| 10861 | ||||
| 10862 | // Extend shift amounts. | |||
| 10863 | if (RHS.getValueSizeInBits() != VT.getScalarSizeInBits()) { | |||
| 10864 | if (!IsShift) | |||
| 10865 | return SDValue(); | |||
| 10866 | RHS = DAG.getZExtOrTrunc(RHS, DL, VT.getScalarType()); | |||
| 10867 | } | |||
| 10868 | ||||
| 10869 | LHSElts.push_back(LHS); | |||
| 10870 | RHSElts.push_back(RHS); | |||
| 10871 | } | |||
| 10872 | ||||
| 10873 | // Limit to shifts by uniform immediates. | |||
| 10874 | // TODO: Only accept vXi8/vXi64 special cases? | |||
| 10875 | // TODO: Permit non-uniform XOP/AVX2/MULLO cases? | |||
| 10876 | if (IsShift && any_of(RHSElts, [&](SDValue V) { return RHSElts[0] != V; })) | |||
| 10877 | return SDValue(); | |||
| 10878 | ||||
| 10879 | SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts); | |||
| 10880 | SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts); | |||
| 10881 | SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS); | |||
| 10882 | ||||
| 10883 | if (!IsShift) | |||
| 10884 | return Res; | |||
| 10885 | ||||
| 10886 | // Immediately lower the shift to ensure the constant build vector doesn't | |||
| 10887 | // get converted to a constant pool before the shift is lowered. | |||
| 10888 | return LowerShift(Res, Subtarget, DAG); | |||
| 10889 | } | |||
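| | // An illustrative sketch of the transform above (operand names are | |||
| | // hypothetical): (build_vector (shl a, 5), (shl b, 5), (shl c, 5), (shl d, 5)) | |||
| | // is rebuilt as (shl (build_vector a, b, c, d), (build_vector 5, 5, 5, 5)) | |||
| | // and handed straight to LowerShift, folding four scalar shifts into one | |||
| | // vector shift by a uniform immediate. | |||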
| 10890 | ||||
| 10891 | /// Create a vector constant without a load. SSE/AVX provide the bare minimum | |||
| 10892 | /// functionality to do this, so it's all zeros, all ones, or some derivation | |||
| 10893 | /// that is cheap to calculate. | |||
| 10894 | static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, | |||
| 10895 | const X86Subtarget &Subtarget) { | |||
| 10896 | SDLoc DL(Op); | |||
| 10897 | MVT VT = Op.getSimpleValueType(); | |||
| 10898 | ||||
| 10899 | // Vectors containing all zeros can be matched by pxor and xorps. | |||
| 10900 | if (ISD::isBuildVectorAllZeros(Op.getNode())) | |||
| 10901 | return Op; | |||
| 10902 | ||||
| 10903 | // Vectors containing all ones can be matched by pcmpeqd on 128-bit width | |||
| 10904 | // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use | |||
| 10905 | // vpcmpeqd on 256-bit vectors. | |||
| 10906 | if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) { | |||
| 10907 | if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) | |||
| 10908 | return Op; | |||
| 10909 | ||||
| 10910 | return getOnesVector(VT, DAG, DL); | |||
| 10911 | } | |||
| 10912 | ||||
| 10913 | return SDValue(); | |||
| 10914 | } | |||
| 10915 | ||||
| 10916 | /// Look for opportunities to create a VPERMV/VPERMILPV/PSHUFB variable permute | |||
| 10917 | /// from a vector of source values and a vector of extraction indices. | |||
| 10918 | /// The vectors might be manipulated to match the type of the permute op. | |||
| 10919 | static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec, | |||
| 10920 | SDLoc &DL, SelectionDAG &DAG, | |||
| 10921 | const X86Subtarget &Subtarget) { | |||
| 10922 | MVT ShuffleVT = VT; | |||
| 10923 | EVT IndicesVT = EVT(VT).changeVectorElementTypeToInteger(); | |||
| 10924 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 10925 | unsigned SizeInBits = VT.getSizeInBits(); | |||
| 10926 | ||||
| 10927 | // Adjust IndicesVec to match VT size. | |||
| 10928 | assert(IndicesVec.getValueType().getVectorNumElements() >= NumElts && | |||
| 10929 | "Illegal variable permute mask size"); | |||
| 10930 | if (IndicesVec.getValueType().getVectorNumElements() > NumElts) { | |||
| 10931 | // Narrow/widen the indices vector to the correct size. | |||
| 10932 | if (IndicesVec.getValueSizeInBits() > SizeInBits) | |||
| 10933 | IndicesVec = extractSubVector(IndicesVec, 0, DAG, SDLoc(IndicesVec), | |||
| 10934 | NumElts * VT.getScalarSizeInBits()); | |||
| 10935 | else if (IndicesVec.getValueSizeInBits() < SizeInBits) | |||
| 10936 | IndicesVec = widenSubVector(IndicesVec, false, Subtarget, DAG, | |||
| 10937 | SDLoc(IndicesVec), SizeInBits); | |||
| 10938 | // Zero-extend the index elements within the vector. | |||
| 10939 | if (IndicesVec.getValueType().getVectorNumElements() > NumElts) | |||
| 10940 | IndicesVec = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(IndicesVec), | |||
| 10941 | IndicesVT, IndicesVec); | |||
| 10942 | } | |||
| 10943 | IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT); | |||
| 10944 | ||||
| 10945 | // Handle a SrcVec that doesn't match the VT size. | |||
| 10946 | if (SrcVec.getValueSizeInBits() != SizeInBits) { | |||
| 10947 | if ((SrcVec.getValueSizeInBits() % SizeInBits) == 0) { | |||
| 10948 | // Handle larger SrcVec by treating it as a larger permute. | |||
| 10949 | unsigned Scale = SrcVec.getValueSizeInBits() / SizeInBits; | |||
| 10950 | VT = MVT::getVectorVT(VT.getScalarType(), Scale * NumElts); | |||
| 10951 | IndicesVT = EVT(VT).changeVectorElementTypeToInteger(); | |||
| 10952 | IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false, | |||
| 10953 | Subtarget, DAG, SDLoc(IndicesVec)); | |||
| 10954 | SDValue NewSrcVec = | |||
| 10955 | createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget); | |||
| 10956 | if (NewSrcVec) | |||
| 10957 | return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits); | |||
| 10958 | return SDValue(); | |||
| 10959 | } else if (SrcVec.getValueSizeInBits() < SizeInBits) { | |||
| 10960 | // Widen smaller SrcVec to match VT. | |||
| 10961 | SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec)); | |||
| 10962 | } else | |||
| 10963 | return SDValue(); | |||
| 10964 | } | |||
| 10965 | ||||
| 10966 | auto ScaleIndices = [&DAG](SDValue Idx, uint64_t Scale) { | |||
| 10967 | assert(isPowerOf2_64(Scale) && "Illegal variable permute shuffle scale"); | |||
| 10968 | EVT SrcVT = Idx.getValueType(); | |||
| 10969 | unsigned NumDstBits = SrcVT.getScalarSizeInBits() / Scale; | |||
| 10970 | uint64_t IndexScale = 0; | |||
| 10971 | uint64_t IndexOffset = 0; | |||
| 10972 | ||||
| 10973 | // If we're scaling a smaller permute op, then we need to repeat the | |||
| 10974 | // indices, scaling and offsetting them as well. | |||
| 10975 | // e.g. v4i32 -> v16i8 (Scale = 4) | |||
| 10976 | // IndexScale = v4i32 Splat(4 << 24 | 4 << 16 | 4 << 8 | 4) | |||
| 10977 | // IndexOffset = v4i32 Splat(3 << 24 | 2 << 16 | 1 << 8 | 0) | |||
| 10978 | for (uint64_t i = 0; i != Scale; ++i) { | |||
| 10979 | IndexScale |= Scale << (i * NumDstBits); | |||
| 10980 | IndexOffset |= i << (i * NumDstBits); | |||
| 10981 | } | |||
| 10982 | ||||
| 10983 | Idx = DAG.getNode(ISD::MUL, SDLoc(Idx), SrcVT, Idx, | |||
| 10984 | DAG.getConstant(IndexScale, SDLoc(Idx), SrcVT)); | |||
| 10985 | Idx = DAG.getNode(ISD::ADD, SDLoc(Idx), SrcVT, Idx, | |||
| 10986 | DAG.getConstant(IndexOffset, SDLoc(Idx), SrcVT)); | |||
| 10987 | return Idx; | |||
| 10988 | }; | |||
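| | // Worked arithmetic for the v4i32 -> v16i8 example above: Scale = 4 and | |||
| | // NumDstBits = 32 / 4 = 8, so the loop builds IndexScale = 0x04040404 and | |||
| | // IndexOffset = 0x03020100 per lane; a lane holding index k therefore | |||
| | // becomes the four byte indices 4k, 4k+1, 4k+2 and 4k+3. | |||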
| 10989 | ||||
| 10990 | unsigned Opcode = 0; | |||
| 10991 | switch (VT.SimpleTy) { | |||
| 10992 | default: | |||
| 10993 | break; | |||
| 10994 | case MVT::v16i8: | |||
| 10995 | if (Subtarget.hasSSSE3()) | |||
| 10996 | Opcode = X86ISD::PSHUFB; | |||
| 10997 | break; | |||
| 10998 | case MVT::v8i16: | |||
| 10999 | if (Subtarget.hasVLX() && Subtarget.hasBWI()) | |||
| 11000 | Opcode = X86ISD::VPERMV; | |||
| 11001 | else if (Subtarget.hasSSSE3()) { | |||
| 11002 | Opcode = X86ISD::PSHUFB; | |||
| 11003 | ShuffleVT = MVT::v16i8; | |||
| 11004 | } | |||
| 11005 | break; | |||
| 11006 | case MVT::v4f32: | |||
| 11007 | case MVT::v4i32: | |||
| 11008 | if (Subtarget.hasAVX()) { | |||
| 11009 | Opcode = X86ISD::VPERMILPV; | |||
| 11010 | ShuffleVT = MVT::v4f32; | |||
| 11011 | } else if (Subtarget.hasSSSE3()) { | |||
| 11012 | Opcode = X86ISD::PSHUFB; | |||
| 11013 | ShuffleVT = MVT::v16i8; | |||
| 11014 | } | |||
| 11015 | break; | |||
| 11016 | case MVT::v2f64: | |||
| 11017 | case MVT::v2i64: | |||
| 11018 | if (Subtarget.hasAVX()) { | |||
| 11019 | // VPERMILPD selects using bit#1 of the index vector, so scale IndicesVec. | |||
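| | // e.g. an index of 1 doubles to 2 (0b10), setting bit#1; 0 stays 0b00. | |||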
| 11020 | IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec); | |||
| 11021 | Opcode = X86ISD::VPERMILPV; | |||
| 11022 | ShuffleVT = MVT::v2f64; | |||
| 11023 | } else if (Subtarget.hasSSE41()) { | |||
| 11024 | // SSE41 can compare v2i64 - select between indices 0 and 1. | |||
| 11025 | return DAG.getSelectCC( | |||
| 11026 | DL, IndicesVec, | |||
| 11027 | getZeroVector(IndicesVT.getSimpleVT(), Subtarget, DAG, DL), | |||
| 11028 | DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {0, 0}), | |||
| 11029 | DAG.getVectorShuffle(VT, DL, SrcVec, SrcVec, {1, 1}), | |||
| 11030 | ISD::CondCode::SETEQ); | |||
| 11031 | } | |||
| 11032 | break; | |||
| 11033 | case MVT::v32i8: | |||
| 11034 | if (Subtarget.hasVLX() && Subtarget.hasVBMI()) | |||
| 11035 | Opcode = X86ISD::VPERMV; | |||
| 11036 | else if (Subtarget.hasXOP()) { | |||
| 11037 | SDValue LoSrc = extract128BitVector(SrcVec, 0, DAG, DL); | |||
| 11038 | SDValue HiSrc = extract128BitVector(SrcVec, 16, DAG, DL); | |||
| 11039 | SDValue LoIdx = extract128BitVector(IndicesVec, 0, DAG, DL); | |||
| 11040 | SDValue HiIdx = extract128BitVector(IndicesVec, 16, DAG, DL); | |||
| 11041 | return DAG.getNode( | |||
| 11042 | ISD::CONCAT_VECTORS, DL, VT, | |||
| 11043 | DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, LoIdx), | |||
| 11044 | DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, LoSrc, HiSrc, HiIdx)); | |||
| 11045 | } else if (Subtarget.hasAVX()) { | |||
| 11046 | SDValue Lo = extract128BitVector(SrcVec, 0, DAG, DL); | |||
| 11047 | SDValue Hi = extract128BitVector(SrcVec, 16, DAG, DL); | |||
| 11048 | SDValue LoLo = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Lo); | |||
| 11049 | SDValue HiHi = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Hi, Hi); | |||
| 11050 | auto PSHUFBBuilder = [](SelectionDAG &DAG, const SDLoc &DL, | |||
| 11051 | ArrayRef<SDValue> Ops) { | |||
| 11052 | // Permute Lo and Hi and then select based on index range. | |||
| 11053 | // This works as PSHUFB uses bits[3:0] to permute elements and we don't | |||
| 11054 | // care about bit[7] as it's just an index vector. | |||
| 11055 | SDValue Idx = Ops[2]; | |||
| 11056 | EVT VT = Idx.getValueType(); | |||
| 11057 | return DAG.getSelectCC(DL, Idx, DAG.getConstant(15, DL, VT), | |||
| 11058 | DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[1], Idx), | |||
| 11059 | DAG.getNode(X86ISD::PSHUFB, DL, VT, Ops[0], Idx), | |||
| 11060 | ISD::CondCode::SETGT); | |||
| 11061 | }; | |||
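| | // Worked example (illustrative): for byte index 20, SETGT(20, 15) picks | |||
| | // the HiHi permute, and PSHUFB's bits[3:0] = 4 select HiHi[4] == SrcVec[20]. | |||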
| 11062 | SDValue Ops[] = {LoLo, HiHi, IndicesVec}; | |||
| 11063 | return SplitOpsAndApply(DAG, Subtarget, DL, MVT::v32i8, Ops, | |||
| 11064 | PSHUFBBuilder); | |||
| 11065 | } | |||
| 11066 | break; | |||
| 11067 | case MVT::v16i16: | |||
| 11068 | if (Subtarget.hasVLX() && Subtarget.hasBWI()) | |||
| 11069 | Opcode = X86ISD::VPERMV; | |||
| 11070 | else if (Subtarget.hasAVX()) { | |||
| 11071 | // Scale to v32i8 and perform as v32i8. | |||
| 11072 | IndicesVec = ScaleIndices(IndicesVec, 2); | |||
| 11073 | return DAG.getBitcast( | |||
| 11074 | VT, createVariablePermute( | |||
| 11075 | MVT::v32i8, DAG.getBitcast(MVT::v32i8, SrcVec), | |||
| 11076 | DAG.getBitcast(MVT::v32i8, IndicesVec), DL, DAG, Subtarget)); | |||
| 11077 | } | |||
| 11078 | break; | |||
| 11079 | case MVT::v8f32: | |||
| 11080 | case MVT::v8i32: | |||
| 11081 | if (Subtarget.hasAVX2()) | |||
| 11082 | Opcode = X86ISD::VPERMV; | |||
| 11083 | else if (Subtarget.hasAVX()) { | |||
| 11084 | SrcVec = DAG.getBitcast(MVT::v8f32, SrcVec); | |||
| 11085 | SDValue LoLo = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec, | |||
| 11086 | {0, 1, 2, 3, 0, 1, 2, 3}); | |||
| 11087 | SDValue HiHi = DAG.getVectorShuffle(MVT::v8f32, DL, SrcVec, SrcVec, | |||
| 11088 | {4, 5, 6, 7, 4, 5, 6, 7}); | |||
| 11089 | if (Subtarget.hasXOP()) | |||
| 11090 | return DAG.getBitcast( | |||
| 11091 | VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v8f32, LoLo, HiHi, | |||
| 11092 | IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8))); | |||
| 11093 | // Permute Lo and Hi and then select based on index range. | |||
| 11094 | // This works as VPERMILPS only uses index bits[0:1] to permute elements. | |||
| 11095 | SDValue Res = DAG.getSelectCC( | |||
| 11096 | DL, IndicesVec, DAG.getConstant(3, DL, MVT::v8i32), | |||
| 11097 | DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, HiHi, IndicesVec), | |||
| 11098 | DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, LoLo, IndicesVec), | |||
| 11099 | ISD::CondCode::SETGT); | |||
| 11100 | return DAG.getBitcast(VT, Res); | |||
| 11101 | } | |||
| 11102 | break; | |||
| 11103 | case MVT::v4i64: | |||
| 11104 | case MVT::v4f64: | |||
| 11105 | if (Subtarget.hasAVX512()) { | |||
| 11106 | if (!Subtarget.hasVLX()) { | |||
| 11107 | MVT WidenSrcVT = MVT::getVectorVT(VT.getScalarType(), 8); | |||
| 11108 | SrcVec = widenSubVector(WidenSrcVT, SrcVec, false, Subtarget, DAG, | |||
| 11109 | SDLoc(SrcVec)); | |||
| 11110 | IndicesVec = widenSubVector(MVT::v8i64, IndicesVec, false, Subtarget, | |||
| 11111 | DAG, SDLoc(IndicesVec)); | |||
| 11112 | SDValue Res = createVariablePermute(WidenSrcVT, SrcVec, IndicesVec, DL, | |||
| 11113 | DAG, Subtarget); | |||
| 11114 | return extract256BitVector(Res, 0, DAG, DL); | |||
| 11115 | } | |||
| 11116 | Opcode = X86ISD::VPERMV; | |||
| 11117 | } else if (Subtarget.hasAVX()) { | |||
| 11118 | SrcVec = DAG.getBitcast(MVT::v4f64, SrcVec); | |||
| 11119 | SDValue LoLo = | |||
| 11120 | DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {0, 1, 0, 1}); | |||
| 11121 | SDValue HiHi = | |||
| 11122 | DAG.getVectorShuffle(MVT::v4f64, DL, SrcVec, SrcVec, {2, 3, 2, 3}); | |||
| 11123 | // VPERMIL2PD selects with bit#1 of the index vector, so scale IndicesVec. | |||
| 11124 | IndicesVec = DAG.getNode(ISD::ADD, DL, IndicesVT, IndicesVec, IndicesVec); | |||
| 11125 | if (Subtarget.hasXOP()) | |||
| 11126 | return DAG.getBitcast( | |||
| 11127 | VT, DAG.getNode(X86ISD::VPERMIL2, DL, MVT::v4f64, LoLo, HiHi, | |||
| 11128 | IndicesVec, DAG.getTargetConstant(0, DL, MVT::i8))); | |||
| 11129 | // Permute Lo and Hi and then select based on index range. | |||
| 11130 | // This works as VPERMILPD only uses index bit[1] to permute elements. | |||
| 11131 | SDValue Res = DAG.getSelectCC( | |||
| 11132 | DL, IndicesVec, DAG.getConstant(2, DL, MVT::v4i64), | |||
| 11133 | DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, HiHi, IndicesVec), | |||
| 11134 | DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v4f64, LoLo, IndicesVec), | |||
| 11135 | ISD::CondCode::SETGT); | |||
| 11136 | return DAG.getBitcast(VT, Res); | |||
| 11137 | } | |||
| 11138 | break; | |||
| 11139 | case MVT::v64i8: | |||
| 11140 | if (Subtarget.hasVBMI()) | |||
| 11141 | Opcode = X86ISD::VPERMV; | |||
| 11142 | break; | |||
| 11143 | case MVT::v32i16: | |||
| 11144 | if (Subtarget.hasBWI()) | |||
| 11145 | Opcode = X86ISD::VPERMV; | |||
| 11146 | break; | |||
| 11147 | case MVT::v16f32: | |||
| 11148 | case MVT::v16i32: | |||
| 11149 | case MVT::v8f64: | |||
| 11150 | case MVT::v8i64: | |||
| 11151 | if (Subtarget.hasAVX512()) | |||
| 11152 | Opcode = X86ISD::VPERMV; | |||
| 11153 | break; | |||
| 11154 | } | |||
| 11155 | if (!Opcode) | |||
| 11156 | return SDValue(); | |||
| 11157 | ||||
| 11158 | assert((VT.getSizeInBits() == ShuffleVT.getSizeInBits()) && | |||
| 11159 | (VT.getScalarSizeInBits() % ShuffleVT.getScalarSizeInBits()) == 0 && | |||
| 11160 | "Illegal variable permute shuffle type"); | |||
| 11161 | ||||
| 11162 | uint64_t Scale = VT.getScalarSizeInBits() / ShuffleVT.getScalarSizeInBits(); | |||
| 11163 | if (Scale > 1) | |||
| 11164 | IndicesVec = ScaleIndices(IndicesVec, Scale); | |||
| 11165 | ||||
| 11166 | EVT ShuffleIdxVT = EVT(ShuffleVT).changeVectorElementTypeToInteger(); | |||
| 11167 | IndicesVec = DAG.getBitcast(ShuffleIdxVT, IndicesVec); | |||
| 11168 | ||||
| 11169 | SrcVec = DAG.getBitcast(ShuffleVT, SrcVec); | |||
| 11170 | SDValue Res = Opcode == X86ISD::VPERMV | |||
| 11171 | ? DAG.getNode(Opcode, DL, ShuffleVT, IndicesVec, SrcVec) | |||
| 11172 | : DAG.getNode(Opcode, DL, ShuffleVT, SrcVec, IndicesVec); | |||
| 11173 | return DAG.getBitcast(VT, Res); | |||
| 11174 | } | |||
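| | // A hedged usage sketch: for VT = v4i32 on an AVX-only subtarget, the switch | |||
| | // above chooses X86ISD::VPERMILPV with ShuffleVT = v4f32, Scale stays 1, and | |||
| | // the result is | |||
| | // (bitcast v4i32 (VPERMILPV (bitcast v4f32 SrcVec), IndicesVec)). | |||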
| 11175 | ||||
| 11176 | // Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be | |||
| 11177 | // reasoned to be a permutation of a vector by indices in a non-constant vector. | |||
| 11178 | // (build_vector (extract_elt V, (extract_elt I, 0)), | |||
| 11179 | // (extract_elt V, (extract_elt I, 1)), | |||
| 11180 | // ... | |||
| 11181 | // -> | |||
| 11182 | // (vpermv I, V) | |||
| 11183 | // | |||
| 11184 | // TODO: Handle undefs | |||
| 11185 | // TODO: Utilize pshufb and zero mask blending to support more efficient | |||
| 11186 | // construction of vectors with constant-0 elements. | |||
| 11187 | static SDValue | |||
| 11188 | LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG, | |||
| 11189 | const X86Subtarget &Subtarget) { | |||
| 11190 | SDValue SrcVec, IndicesVec; | |||
| 11191 | // Check for a match of the permute source vector and permute index elements. | |||
| 11192 | // This is done by checking that the i-th build_vector operand is of the form: | |||
| 11193 | // (extract_elt SrcVec, (extract_elt IndicesVec, i)). | |||
| 11194 | for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) { | |||
| 11195 | SDValue Op = V.getOperand(Idx); | |||
| 11196 | if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT) | |||
| 11197 | return SDValue(); | |||
| 11198 | ||||
| 11199 | // If this is the first extract encountered in V, set the source vector, | |||
| 11200 | // otherwise verify the extract is from the previously defined source | |||
| 11201 | // vector. | |||
| 11202 | if (!SrcVec) | |||
| 11203 | SrcVec = Op.getOperand(0); | |||
| 11204 | else if (SrcVec != Op.getOperand(0)) | |||
| 11205 | return SDValue(); | |||
| 11206 | SDValue ExtractedIndex = Op->getOperand(1); | |||
| 11207 | // Peek through extends. | |||
| 11208 | if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND || | |||
| 11209 | ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND) | |||
| 11210 | ExtractedIndex = ExtractedIndex.getOperand(0); | |||
| 11211 | if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT) | |||
| 11212 | return SDValue(); | |||
| 11213 | ||||
| 11214 | // If this is the first extract from the index vector candidate, set the | |||
| 11215 | // indices vector, otherwise verify the extract is from the previously | |||
| 11216 | // defined indices vector. | |||
| 11217 | if (!IndicesVec) | |||
| 11218 | IndicesVec = ExtractedIndex.getOperand(0); | |||
| 11219 | else if (IndicesVec != ExtractedIndex.getOperand(0)) | |||
| 11220 | return SDValue(); | |||
| 11221 | ||||
| 11222 | auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1)); | |||
| 11223 | if (!PermIdx || PermIdx->getAPIntValue() != Idx) | |||
| 11224 | return SDValue(); | |||
| 11225 | } | |||
| 11226 | ||||
| 11227 | SDLoc DL(V); | |||
| 11228 | MVT VT = V.getSimpleValueType(); | |||
| 11229 | return createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget); | |||
| 11230 | } | |||
| 11231 | ||||
| 11232 | SDValue | |||
| 11233 | X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { | |||
| 11234 | SDLoc dl(Op); | |||
| 11235 | ||||
| 11236 | MVT VT = Op.getSimpleValueType(); | |||
| 11237 | MVT EltVT = VT.getVectorElementType(); | |||
| 11238 | MVT OpEltVT = Op.getOperand(0).getSimpleValueType(); | |||
| 11239 | unsigned NumElems = Op.getNumOperands(); | |||
| 11240 | ||||
| 11241 | // Generate vectors for predicate vectors. | |||
| 11242 | if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512()) | |||
| 11243 | return LowerBUILD_VECTORvXi1(Op, DAG, Subtarget); | |||
| 11244 | ||||
| 11245 | if (VT.getVectorElementType() == MVT::bf16 && Subtarget.hasBF16()) | |||
| 11246 | return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget); | |||
| 11247 | ||||
| 11248 | if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget)) | |||
| 11249 | return VectorConstant; | |||
| 11250 | ||||
| 11251 | unsigned EVTBits = EltVT.getSizeInBits(); | |||
| 11252 | APInt UndefMask = APInt::getZero(NumElems); | |||
| 11253 | APInt FrozenUndefMask = APInt::getZero(NumElems); | |||
| 11254 | APInt ZeroMask = APInt::getZero(NumElems); | |||
| 11255 | APInt NonZeroMask = APInt::getZero(NumElems); | |||
| 11256 | bool IsAllConstants = true; | |||
| 11257 | bool OneUseFrozenUndefs = true; | |||
| 11258 | SmallSet<SDValue, 8> Values; | |||
| 11259 | unsigned NumConstants = NumElems; | |||
| 11260 | for (unsigned i = 0; i < NumElems; ++i) { | |||
| 11261 | SDValue Elt = Op.getOperand(i); | |||
| 11262 | if (Elt.isUndef()) { | |||
| 11263 | UndefMask.setBit(i); | |||
| 11264 | continue; | |||
| 11265 | } | |||
| 11266 | if (ISD::isFreezeUndef(Elt.getNode())) { | |||
| 11267 | OneUseFrozenUndefs = OneUseFrozenUndefs && Elt->hasOneUse(); | |||
| 11268 | FrozenUndefMask.setBit(i); | |||
| 11269 | continue; | |||
| 11270 | } | |||
| 11271 | Values.insert(Elt); | |||
| 11272 | if (!isIntOrFPConstant(Elt)) { | |||
| 11273 | IsAllConstants = false; | |||
| 11274 | NumConstants--; | |||
| 11275 | } | |||
| 11276 | if (X86::isZeroNode(Elt)) { | |||
| 11277 | ZeroMask.setBit(i); | |||
| 11278 | } else { | |||
| 11279 | NonZeroMask.setBit(i); | |||
| 11280 | } | |||
| 11281 | } | |||
| 11282 | ||||
| 11283 | // All undef vector. Return an UNDEF. | |||
| 11284 | if (UndefMask.isAllOnes()) | |||
| 11285 | return DAG.getUNDEF(VT); | |||
| 11286 | ||||
| 11287 | // All undef/freeze(undef) vector. Return a FREEZE UNDEF. | |||
| 11288 | if (OneUseFrozenUndefs && (UndefMask | FrozenUndefMask).isAllOnes()) | |||
| 11289 | return DAG.getFreeze(DAG.getUNDEF(VT)); | |||
| 11290 | ||||
| 11291 | // All undef/freeze(undef)/zero vector. Return a zero vector. | |||
| 11292 | if ((UndefMask | FrozenUndefMask | ZeroMask).isAllOnes()) | |||
| 11293 | return getZeroVector(VT, Subtarget, DAG, dl); | |||
| 11294 | ||||
| 11295 | // If we have multiple FREEZE-UNDEF operands, we are likely going to end up | |||
| 11296 | // lowering into a suboptimal insertion sequence. Instead, thaw the UNDEF in | |||
| 11297 | // our source BUILD_VECTOR, create another FREEZE-UNDEF splat BUILD_VECTOR, | |||
| 11298 | // and blend the FREEZE-UNDEF operands back in. | |||
| 11299 | // FIXME: is this worthwhile even for a single FREEZE-UNDEF operand? | |||
| 11300 | if (unsigned NumFrozenUndefElts = FrozenUndefMask.popcount(); | |||
| 11301 | NumFrozenUndefElts >= 2 && NumFrozenUndefElts < NumElems) { | |||
| 11302 | SmallVector<int, 16> BlendMask(NumElems, -1); | |||
| 11303 | SmallVector<SDValue, 16> Elts(NumElems, DAG.getUNDEF(OpEltVT)); | |||
| 11304 | for (unsigned i = 0; i < NumElems; ++i) { | |||
| 11305 | if (UndefMask[i]) { | |||
| 11306 | BlendMask[i] = -1; | |||
| 11307 | continue; | |||
| 11308 | } | |||
| 11309 | BlendMask[i] = i; | |||
| 11310 | if (!FrozenUndefMask[i]) | |||
| 11311 | Elts[i] = Op.getOperand(i); | |||
| 11312 | else | |||
| 11313 | BlendMask[i] += NumElems; | |||
| 11314 | } | |||
| 11315 | SDValue EltsBV = DAG.getBuildVector(VT, dl, Elts); | |||
| 11316 | SDValue FrozenUndefElt = DAG.getFreeze(DAG.getUNDEF(OpEltVT)); | |||
| 11317 | SDValue FrozenUndefBV = DAG.getSplatBuildVector(VT, dl, FrozenUndefElt); | |||
| 11318 | return DAG.getVectorShuffle(VT, dl, EltsBV, FrozenUndefBV, BlendMask); | |||
| 11319 | } | |||
| 11320 | ||||
| 11321 | BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode()); | |||
| 11322 | ||||
| 11323 | // If the upper elts of a ymm/zmm are undef/freeze(undef)/zero then we might | |||
| 11324 | // be better off lowering to a smaller build vector and padding with | |||
| 11325 | // undef/zero. | |||
| 11326 | if ((VT.is256BitVector() || VT.is512BitVector()) && | |||
| 11327 | !isFoldableUseOfShuffle(BV)) { | |||
| 11328 | unsigned UpperElems = NumElems / 2; | |||
| 11329 | APInt UndefOrZeroMask = FrozenUndefMask | UndefMask | ZeroMask; | |||
| 11330 | unsigned NumUpperUndefsOrZeros = UndefOrZeroMask.countl_one(); | |||
| 11331 | if (NumUpperUndefsOrZeros >= UpperElems) { | |||
| 11332 | if (VT.is512BitVector() && | |||
| 11333 | NumUpperUndefsOrZeros >= (NumElems - (NumElems / 4))) | |||
| 11334 | UpperElems = NumElems - (NumElems / 4); | |||
| 11335 | // If freeze(undef) is in any upper elements, force to zero. | |||
| 11336 | bool UndefUpper = UndefMask.countl_one() >= UpperElems; | |||
| 11337 | MVT LowerVT = MVT::getVectorVT(EltVT, NumElems - UpperElems); | |||
| 11338 | SDValue NewBV = | |||
| 11339 | DAG.getBuildVector(LowerVT, dl, Op->ops().drop_back(UpperElems)); | |||
| 11340 | return widenSubVector(VT, NewBV, !UndefUpper, Subtarget, DAG, dl); | |||
| 11341 | } | |||
| 11342 | } | |||
| 11343 | ||||
| 11344 | if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG)) | |||
| 11345 | return AddSub; | |||
| 11346 | if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) | |||
| 11347 | return HorizontalOp; | |||
| 11348 | if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG)) | |||
| 11349 | return Broadcast; | |||
| 11350 | if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG)) | |||
| 11351 | return BitOp; | |||
| 11352 | ||||
| 11353 | unsigned NumZero = ZeroMask.popcount(); | |||
| 11354 | unsigned NumNonZero = NonZeroMask.popcount(); | |||
| 11355 | ||||
| 11356 | // If we are inserting one variable into a vector of non-zero constants, try | |||
| 11357 | // to avoid loading each constant element as a scalar. Load the constants as a | |||
| 11358 | // vector and then insert the variable scalar element. If insertion is not | |||
| 11359 | // supported, fall back to a shuffle to get the scalar blended with the | |||
| 11360 | // constants. Insertion into a zero vector is handled as a special-case | |||
| 11361 | // somewhere below here. | |||
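| | // For example (illustrative): (v4i32 1, 2, X, 4) loads <1, 2, undef, 4> | |||
| | // from the constant pool and emits (insert_vector_elt Ld, X, 2). | |||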
| 11362 | if (NumConstants == NumElems - 1 && NumNonZero != 1 && | |||
| 11363 | (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) || | |||
| 11364 | isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) { | |||
| 11365 | // Create an all-constant vector. The variable element in the old | |||
| 11366 | // build vector is replaced by undef in the constant vector. Save the | |||
| 11367 | // variable scalar element and its index for use in the insertelement. | |||
| 11368 | LLVMContext &Context = *DAG.getContext(); | |||
| 11369 | Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context); | |||
| 11370 | SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType)); | |||
| 11371 | SDValue VarElt; | |||
| 11372 | SDValue InsIndex; | |||
| 11373 | for (unsigned i = 0; i != NumElems; ++i) { | |||
| 11374 | SDValue Elt = Op.getOperand(i); | |||
| 11375 | if (auto *C = dyn_cast<ConstantSDNode>(Elt)) | |||
| 11376 | ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue()); | |||
| 11377 | else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt)) | |||
| 11378 | ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF()); | |||
| 11379 | else if (!Elt.isUndef()) { | |||
| 11380 | assert(!VarElt.getNode() && !InsIndex.getNode() && | |||
| 11381 | "Expected one variable element in this vector"); | |||
| 11382 | VarElt = Elt; | |||
| 11383 | InsIndex = DAG.getVectorIdxConstant(i, dl); | |||
| 11384 | } | |||
| 11385 | } | |||
| 11386 | Constant *CV = ConstantVector::get(ConstVecOps); | |||
| 11387 | SDValue DAGConstVec = DAG.getConstantPool(CV, VT); | |||
| 11388 | ||||
| 11389 | // The constants we just created may not be legal (e.g., floating point). We | |||
| 11390 | // must lower the vector right here because we cannot guarantee that we'll | |||
| 11391 | // legalize it before loading it. This is also why we could not just create | |||
| 11392 | // a new build vector here. If the build vector contains illegal constants, | |||
| 11393 | // it could get split back up into a series of insert elements. | |||
| 11394 | // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD. | |||
| 11395 | SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG); | |||
| 11396 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 11397 | MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF); | |||
| 11398 | SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI); | |||
| 11399 | unsigned InsertC = cast<ConstantSDNode>(InsIndex)->getZExtValue(); | |||
| 11400 | unsigned NumEltsInLow128Bits = 128 / VT.getScalarSizeInBits(); | |||
| 11401 | if (InsertC < NumEltsInLow128Bits) | |||
| 11402 | return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex); | |||
| 11403 | ||||
| 11404 | // There's no good way to insert into the high elements of a >128-bit | |||
| 11405 | // vector, so use shuffles to avoid an extract/insert sequence. | |||
| 11406 | assert(VT.getSizeInBits() > 128 && "Invalid insertion index?"); | |||
| 11407 | assert(Subtarget.hasAVX() && "Must have AVX with >16-byte vector"); | |||
| 11408 | SmallVector<int, 8> ShuffleMask; | |||
| 11409 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 11410 | for (unsigned i = 0; i != NumElts; ++i) | |||
| 11411 | ShuffleMask.push_back(i == InsertC ? NumElts : i); | |||
| 11412 | SDValue S2V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, VarElt); | |||
| 11413 | return DAG.getVectorShuffle(VT, dl, Ld, S2V, ShuffleMask); | |||
| 11414 | } | |||
| 11415 | ||||
| 11416 | // Special case for a single non-zero, non-undef element. | |||
| 11417 | if (NumNonZero == 1) { | |||
| 11418 | unsigned Idx = NonZeroMask.countr_zero(); | |||
| 11419 | SDValue Item = Op.getOperand(Idx); | |||
| 11420 | ||||
| 11421 | // If we have a constant or non-constant insertion into the low element of | |||
| 11422 | // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into | |||
| 11423 | // the rest of the elements. This will be matched as movd/movq/movss/movsd | |||
| 11424 | // depending on what the source datatype is. | |||
| 11425 | if (Idx == 0) { | |||
| 11426 | if (NumZero == 0) | |||
| 11427 | return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); | |||
| 11428 | ||||
| 11429 | if (EltVT == MVT::i32 || EltVT == MVT::f16 || EltVT == MVT::f32 || | |||
| 11430 | EltVT == MVT::f64 || (EltVT == MVT::i64 && Subtarget.is64Bit()) || | |||
| 11431 | (EltVT == MVT::i16 && Subtarget.hasFP16())) { | |||
| 11432 | assert((VT.is128BitVector() || VT.is256BitVector() || | |||
| 11433 | VT.is512BitVector()) && | |||
| 11434 | "Expected an SSE value type!"); | |||
| 11435 | Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); | |||
| 11436 | // Turn it into a MOVL (i.e. movsh, movss, movsd, movw or movd) to a | |||
| 11437 | // zero vector. | |||
| 11438 | return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); | |||
| 11439 | } | |||
| 11440 | ||||
| 11441 | // We can't directly insert an i8 or i16 into a vector, so zero extend | |||
| 11442 | // it to i32 first. | |||
| 11443 | if (EltVT == MVT::i16 || EltVT == MVT::i8) { | |||
| 11444 | Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item); | |||
| 11445 | MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32); | |||
| 11446 | Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item); | |||
| 11447 | Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG); | |||
| 11448 | return DAG.getBitcast(VT, Item); | |||
| 11449 | } | |||
| 11450 | } | |||
| 11451 | ||||
| 11452 | // Is it a vector logical left shift? | |||
| 11453 | if (NumElems == 2 && Idx == 1 && | |||
| 11454 | X86::isZeroNode(Op.getOperand(0)) && | |||
| 11455 | !X86::isZeroNode(Op.getOperand(1))) { | |||
| 11456 | unsigned NumBits = VT.getSizeInBits(); | |||
| 11457 | return getVShift(true, VT, | |||
| 11458 | DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, | |||
| 11459 | VT, Op.getOperand(1)), | |||
| 11460 | NumBits/2, DAG, *this, dl); | |||
| 11461 | } | |||
| 11462 | ||||
| 11463 | if (IsAllConstants) // Otherwise, it's better to do a constpool load. | |||
| 11464 | return SDValue(); | |||
| 11465 | ||||
| 11466 | // Otherwise, if this is a vector with i32 or f32 elements, and the element | |||
| 11467 | // is a non-constant being inserted into an element other than the low one, | |||
| 11468 | // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka | |||
| 11469 | // movd/movss) to move this into the low element, then shuffle it into | |||
| 11470 | // place. | |||
| 11471 | if (EVTBits == 32) { | |||
| 11472 | Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item); | |||
| 11473 | return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG); | |||
| 11474 | } | |||
| 11475 | } | |||
| 11476 | ||||
| 11477 | // Splat is obviously ok. Let legalizer expand it to a shuffle. | |||
| 11478 | if (Values.size() == 1) { | |||
| 11479 | if (EVTBits == 32) { | |||
| 11480 | // Instead of a shuffle like this: | |||
| 11481 | // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> | |||
| 11482 | // Check if it's possible to issue this instead. | |||
| 11483 | // shuffle (vload ptr), undef, <1, 1, 1, 1> | |||
| 11484 | unsigned Idx = NonZeroMask.countr_zero(); | |||
| 11485 | SDValue Item = Op.getOperand(Idx); | |||
| 11486 | if (Op.getNode()->isOnlyUserOf(Item.getNode())) | |||
| 11487 | return LowerAsSplatVectorLoad(Item, VT, dl, DAG); | |||
| 11488 | } | |||
| 11489 | return SDValue(); | |||
| 11490 | } | |||
| 11491 | ||||
| 11492 | // A vector full of immediates; various special cases are already | |||
| 11493 | // handled, so this is best done with a single constant-pool load. | |||
| 11494 | if (IsAllConstants) | |||
| 11495 | return SDValue(); | |||
| 11496 | ||||
| 11497 | if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget)) | |||
| 11498 | return V; | |||
| 11499 | ||||
| 11500 | // See if we can use a vector load to get all of the elements. | |||
| 11501 | { | |||
| 11502 | SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems); | |||
| 11503 | if (SDValue LD = | |||
| 11504 | EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) | |||
| 11505 | return LD; | |||
| 11506 | } | |||
| 11507 | ||||
| 11508 | // If this is a splat of pairs of 32-bit elements, we can use a narrower | |||
| 11509 | // build_vector and broadcast it. | |||
| 11510 | // TODO: We could probably generalize this more. | |||
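| | // For example (illustrative): v8i32 <a,b,a,b,a,b,a,b> is rebuilt as the | |||
| | // narrow build_vector <a,b,undef,undef>, bitcast to v2i64, broadcast to | |||
| | // v4i64, and bitcast back to v8i32, splatting the <a,b> pair. | |||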
| 11511 | if (Subtarget.hasAVX2() && EVTBits == 32 && Values.size() == 2) { | |||
| 11512 | SDValue Ops[4] = { Op.getOperand(0), Op.getOperand(1), | |||
| 11513 | DAG.getUNDEF(EltVT), DAG.getUNDEF(EltVT) }; | |||
| 11514 | auto CanSplat = [](SDValue Op, unsigned NumElems, ArrayRef<SDValue> Ops) { | |||
| 11515 | // Make sure all the even/odd operands match. | |||
| 11516 | for (unsigned i = 2; i != NumElems; ++i) | |||
| 11517 | if (Ops[i % 2] != Op.getOperand(i)) | |||
| 11518 | return false; | |||
| 11519 | return true; | |||
| 11520 | }; | |||
| 11521 | if (CanSplat(Op, NumElems, Ops)) { | |||
| 11522 | MVT WideEltVT = VT.isFloatingPoint() ? MVT::f64 : MVT::i64; | |||
| 11523 | MVT NarrowVT = MVT::getVectorVT(EltVT, 4); | |||
| 11524 | // Create a new build vector and cast to v2i64/v2f64. | |||
| 11525 | SDValue NewBV = DAG.getBitcast(MVT::getVectorVT(WideEltVT, 2), | |||
| 11526 | DAG.getBuildVector(NarrowVT, dl, Ops)); | |||
| 11527 | // Broadcast from v2i64/v2f64 and cast to final VT. | |||
| 11528 | MVT BcastVT = MVT::getVectorVT(WideEltVT, NumElems / 2); | |||
| 11529 | return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, dl, BcastVT, | |||
| 11530 | NewBV)); | |||
| 11531 | } | |||
| 11532 | } | |||
| 11533 | ||||
| 11534 | // For AVX-length vectors, build the individual 128-bit pieces and use | |||
| 11535 | // shuffles to put them in place. | |||
| 11536 | if (VT.getSizeInBits() > 128) { | |||
| 11537 | MVT HVT = MVT::getVectorVT(EltVT, NumElems / 2); | |||
| 11538 | ||||
| 11539 | // Build both the lower and upper subvector. | |||
| 11540 | SDValue Lower = | |||
| 11541 | DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2)); | |||
| 11542 | SDValue Upper = DAG.getBuildVector( | |||
| 11543 | HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2)); | |||
| 11544 | ||||
| 11545 | // Recreate the wider vector with the lower and upper part. | |||
| 11546 | return concatSubVectors(Lower, Upper, DAG, dl); | |||
| 11547 | } | |||
| 11548 | ||||
| 11549 | // Let legalizer expand 2-wide build_vectors. | |||
| 11550 | if (EVTBits == 64) { | |||
| 11551 | if (NumNonZero == 1) { | |||
| 11552 | // One half is zero or undef. | |||
| 11553 | unsigned Idx = NonZeroMask.countr_zero(); | |||
| 11554 | SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, | |||
| 11555 | Op.getOperand(Idx)); | |||
| 11556 | return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG); | |||
| 11557 | } | |||
| 11558 | return SDValue(); | |||
| 11559 | } | |||
| 11560 | ||||
| 11561 | // If element VT is < 32 bits, convert it to inserts into a zero vector. | |||
| 11562 | if (EVTBits == 8 && NumElems == 16) | |||
| 11563 | if (SDValue V = LowerBuildVectorv16i8(Op, NonZeroMask, NumNonZero, NumZero, | |||
| 11564 | DAG, Subtarget)) | |||
| 11565 | return V; | |||
| 11566 | ||||
| 11567 | if (EltVT == MVT::i16 && NumElems == 8) | |||
| 11568 | if (SDValue V = LowerBuildVectorv8i16(Op, NonZeroMask, NumNonZero, NumZero, | |||
| 11569 | DAG, Subtarget)) | |||
| 11570 | return V; | |||
| 11571 | ||||
| 11572 | // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS | |||
| 11573 | if (EVTBits == 32 && NumElems == 4) | |||
| 11574 | if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget)) | |||
| 11575 | return V; | |||
| 11576 | ||||
| 11577 | // If element VT is == 32 bits, turn it into a number of shuffles. | |||
| 11578 | if (NumElems == 4 && NumZero > 0) { | |||
| 11579 | SmallVector<SDValue, 8> Ops(NumElems); | |||
| 11580 | for (unsigned i = 0; i < 4; ++i) { | |||
| 11581 | bool isZero = !NonZeroMask[i]; | |||
| 11582 | if (isZero) | |||
| 11583 | Ops[i] = getZeroVector(VT, Subtarget, DAG, dl); | |||
| 11584 | else | |||
| 11585 | Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); | |||
| 11586 | } | |||
| 11587 | ||||
| 11588 | for (unsigned i = 0; i < 2; ++i) { | |||
| 11589 | switch (NonZeroMask.extractBitsAsZExtValue(2, i * 2)) { | |||
| 11590 | default: llvm_unreachable("Unexpected NonZero count"); | |||
| 11591 | case 0: | |||
| 11592 | Ops[i] = Ops[i*2]; // Must be a zero vector. | |||
| 11593 | break; | |||
| 11594 | case 1: | |||
| 11595 | Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]); | |||
| 11596 | break; | |||
| 11597 | case 2: | |||
| 11598 | Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); | |||
| 11599 | break; | |||
| 11600 | case 3: | |||
| 11601 | Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); | |||
| 11602 | break; | |||
| 11603 | } | |||
| 11604 | } | |||
| 11605 | ||||
| 11606 | bool Reverse1 = NonZeroMask.extractBitsAsZExtValue(2, 0) == 2; | |||
| 11607 | bool Reverse2 = NonZeroMask.extractBitsAsZExtValue(2, 2) == 2; | |||
| 11608 | int MaskVec[] = { | |||
| 11609 | Reverse1 ? 1 : 0, | |||
| 11610 | Reverse1 ? 0 : 1, | |||
| 11611 | static_cast<int>(Reverse2 ? NumElems+1 : NumElems), | |||
| 11612 | static_cast<int>(Reverse2 ? NumElems : NumElems+1) | |||
| 11613 | }; | |||
| 11614 | return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec); | |||
| 11615 | } | |||
| 11616 | ||||
| 11617 | assert(Values.size() > 1 && "Expected non-undef and non-splat vector"); | |||
| 11618 | ||||
| 11619 | // Check for a build vector from mostly shuffle plus few inserting. | |||
| 11620 | if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) | |||
| 11621 | return Sh; | |||
| 11622 | ||||
| 11623 | // For SSE 4.1, use insertps to put the high elements into the low element. | |||
| 11624 | if (Subtarget.hasSSE41() && EltVT != MVT::f16) { | |||
| 11625 | SDValue Result; | |||
| 11626 | if (!Op.getOperand(0).isUndef()) | |||
| 11627 | Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); | |||
| 11628 | else | |||
| 11629 | Result = DAG.getUNDEF(VT); | |||
| 11630 | ||||
| 11631 | for (unsigned i = 1; i < NumElems; ++i) { | |||
| 11632 | if (Op.getOperand(i).isUndef()) continue; | |||
| 11633 | Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, | |||
| 11634 | Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); | |||
| 11635 | } | |||
| 11636 | return Result; | |||
| 11637 | } | |||
| 11638 | ||||
| 11639 | // Otherwise, expand into a number of unpckl*, start by extending each of | |||
| 11640 | // our (non-undef) elements to the full vector width with the element in the | |||
| 11641 | // bottom slot of the vector (which generates no code for SSE). | |||
| 11642 | SmallVector<SDValue, 8> Ops(NumElems); | |||
| 11643 | for (unsigned i = 0; i < NumElems; ++i) { | |||
| 11644 | if (!Op.getOperand(i).isUndef()) | |||
| 11645 | Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); | |||
| 11646 | else | |||
| 11647 | Ops[i] = DAG.getUNDEF(VT); | |||
| 11648 | } | |||
| 11649 | ||||
| 11650 | // Next, we iteratively mix elements, e.g. for v4f32: | |||
| 11651 | // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0> | |||
| 11652 | // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2> | |||
| 11653 | // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> | |||
| 11654 | for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) { | |||
| 11655 | // Generate scaled UNPCKL shuffle mask. | |||
| 11656 | SmallVector<int, 16> Mask; | |||
| 11657 | for (unsigned i = 0; i != Scale; ++i) | |||
| 11658 | Mask.push_back(i); | |||
| 11659 | for (unsigned i = 0; i != Scale; ++i) | |||
| 11660 | Mask.push_back(NumElems+i); | |||
| 11661 | Mask.append(NumElems - Mask.size(), SM_SentinelUndef); | |||
| 11662 | ||||
| 11663 | for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i) | |||
| 11664 | Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask); | |||
| 11665 | } | |||
| 11666 | return Ops[0]; | |||
| 11667 | } | |||
| 11668 | ||||
| 11669 | // 256-bit AVX can use the vinsertf128 instruction | |||
| 11670 | // to create 256-bit vectors from two other 128-bit ones. | |||
| 11671 | // TODO: Detect subvector broadcast here instead of DAG combine? | |||
| 11672 | static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, | |||
| 11673 | const X86Subtarget &Subtarget) { | |||
| 11674 | SDLoc dl(Op); | |||
| 11675 | MVT ResVT = Op.getSimpleValueType(); | |||
| 11676 | ||||
| 11677 | assert((ResVT.is256BitVector() || ResVT.is512BitVector()) && | |||
| 11678 | "Value type must be 256-/512-bit wide"); | |||
| 11679 | ||||
| 11680 | unsigned NumOperands = Op.getNumOperands(); | |||
| 11681 | unsigned NumFreezeUndef = 0; | |||
| 11682 | unsigned NumZero = 0; | |||
| 11683 | unsigned NumNonZero = 0; | |||
| 11684 | unsigned NonZeros = 0; | |||
| 11685 | for (unsigned i = 0; i != NumOperands; ++i) { | |||
| 11686 | SDValue SubVec = Op.getOperand(i); | |||
| 11687 | if (SubVec.isUndef()) | |||
| 11688 | continue; | |||
| 11689 | if (ISD::isFreezeUndef(SubVec.getNode())) { | |||
| 11690 | // If the freeze(undef) has multiple uses then we must fold to zero. | |||
| 11691 | if (SubVec.hasOneUse()) | |||
| 11692 | ++NumFreezeUndef; | |||
| 11693 | else | |||
| 11694 | ++NumZero; | |||
| 11695 | } | |||
| 11696 | else if (ISD::isBuildVectorAllZeros(SubVec.getNode())) | |||
| 11697 | ++NumZero; | |||
| 11698 | else { | |||
| 11699 | assert(i < sizeof(NonZeros) * CHAR_BIT)(static_cast <bool> (i < sizeof(NonZeros) * 8) ? void (0) : __assert_fail ("i < sizeof(NonZeros) * CHAR_BIT", "llvm/lib/Target/X86/X86ISelLowering.cpp" , 11699, __extension__ __PRETTY_FUNCTION__)); // Ensure the shift is in range. | |||
| 11700 | NonZeros |= 1 << i; | |||
| 11701 | ++NumNonZero; | |||
| 11702 | } | |||
| 11703 | } | |||
| 11704 | ||||
| 11705 | // If we have more than 2 non-zeros, build each half separately. | |||
| 11706 | if (NumNonZero > 2) { | |||
| 11707 | MVT HalfVT = ResVT.getHalfNumVectorElementsVT(); | |||
| 11708 | ArrayRef<SDUse> Ops = Op->ops(); | |||
| 11709 | SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, | |||
| 11710 | Ops.slice(0, NumOperands/2)); | |||
| 11711 | SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, | |||
| 11712 | Ops.slice(NumOperands/2)); | |||
| 11713 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); | |||
| 11714 | } | |||
| 11715 | ||||
| 11716 | // Otherwise, build it up through insert_subvectors. | |||
| 11717 | SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl) | |||
| 11718 | : (NumFreezeUndef ? DAG.getFreeze(DAG.getUNDEF(ResVT)) | |||
| 11719 | : DAG.getUNDEF(ResVT)); | |||
| 11720 | ||||
| 11721 | MVT SubVT = Op.getOperand(0).getSimpleValueType(); | |||
| 11722 | unsigned NumSubElems = SubVT.getVectorNumElements(); | |||
| 11723 | for (unsigned i = 0; i != NumOperands; ++i) { | |||
| 11724 | if ((NonZeros & (1 << i)) == 0) | |||
| 11725 | continue; | |||
| 11726 | ||||
| 11727 | Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, | |||
| 11728 | Op.getOperand(i), | |||
| 11729 | DAG.getIntPtrConstant(i * NumSubElems, dl)); | |||
| 11730 | } | |||
| 11731 | ||||
| 11732 | return Vec; | |||
| 11733 | } | |||
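
// Worked example for the insert_subvector path above: a v8i32 concat of two
// v4i32 subvectors <X, zeroinitializer> has NumNonZero == 1, so Vec is seeded
// with a zero vector and a single INSERT_SUBVECTOR of X at element 0 is
// emitted, which the backend can select to one instruction (an insert, or a
// 128-bit move that implicitly zeroes the upper lane).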

// Returns true if the given node is a type promotion (by concatenating i1
// zeros) of the result of a node that already zeros all upper bits of
// k-register.
// TODO: Merge this with LowerAVXCONCAT_VECTORS?
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();
  unsigned NumOperands = Op.getNumOperands();

  assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  uint64_t Zeros = 0;
  uint64_t NonZeros = 0;
  for (unsigned i = 0; i != NumOperands; ++i) {
    SDValue SubVec = Op.getOperand(i);
    if (SubVec.isUndef())
      continue;
    assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
    if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
      Zeros |= (uint64_t)1 << i;
    else
      NonZeros |= (uint64_t)1 << i;
  }

  unsigned NumElems = ResVT.getVectorNumElements();

  // If we are inserting a non-zero vector and there are zeros in LSBs and
  // undef in the MSBs we need to emit a KSHIFTL. The generic lowering to
  // insert_subvector will give us two kshifts.
  if (isPowerOf2_64(NonZeros) && Zeros != 0 && NonZeros > Zeros &&
      Log2_64(NonZeros) != NumOperands - 1) {
    MVT ShiftVT = ResVT;
    if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
      ShiftVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
    unsigned Idx = Log2_64(NonZeros);
    SDValue SubVec = Op.getOperand(Idx);
    unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
    SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ShiftVT,
                         DAG.getUNDEF(ShiftVT), SubVec,
                         DAG.getIntPtrConstant(0, dl));
    Op = DAG.getNode(X86ISD::KSHIFTL, dl, ShiftVT, SubVec,
                     DAG.getTargetConstant(Idx * SubVecNumElts, dl, MVT::i8));
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Op,
                       DAG.getIntPtrConstant(0, dl));
  }

  // If there are zero or one non-zeros we can handle this very simply.
  if (NonZeros == 0 || isPowerOf2_64(NonZeros)) {
    SDValue Vec = Zeros ? DAG.getConstant(0, dl, ResVT) : DAG.getUNDEF(ResVT);
    if (!NonZeros)
      return Vec;
    unsigned Idx = Log2_64(NonZeros);
    SDValue SubVec = Op.getOperand(Idx);
    unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
                       DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
  }

  if (NumOperands > 2) {
    MVT HalfVT = ResVT.getHalfNumVectorElementsVT();
    ArrayRef<SDUse> Ops = Op->ops();
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(0, NumOperands/2));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
                             Ops.slice(NumOperands/2));
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }

  assert(llvm::popcount(NonZeros) == 2 && "Simple cases not handled?");

  if (ResVT.getVectorNumElements() >= 16)
    return Op; // The operation is legal with KUNPCK

  SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
                            DAG.getUNDEF(ResVT), Op.getOperand(0),
                            DAG.getIntPtrConstant(0, dl));
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
                     DAG.getIntPtrConstant(NumElems/2, dl));
}
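
// Worked example of the KSHIFTL path above: a v8i1 concat of four v2i1
// operands <zeros, X, undef, undef> has Zeros == 0b0001 and NonZeros ==
// 0b0010, so X is widened into the bottom of a legal shift type and shifted
// left by Idx * SubVecNumElts == 2 bits; the shift zeros the LSBs for free,
// saving one of the two kshifts the generic lowering would emit.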

static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getVectorElementType() == MVT::i1)
    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);

  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
                                  Op.getNumOperands() == 4)));

  // AVX can use the vinsertf128 instruction to create 256-bit vectors
  // from two other 128-bit ones.

  // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
  return LowerAVXCONCAT_VECTORS(Op, DAG, Subtarget);
}

//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// patterns.
//===----------------------------------------------------------------------===//

/// Tiny helper function to identify a no-op mask.
///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
/// used by the X86 shuffle instructions (not a fully general
/// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and
/// an in-place shuffle are 'no-op's.
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;
  }
  return true;
}

/// Test whether there are elements crossing LaneSizeInBits lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these.
static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
                                      unsigned ScalarSizeInBits,
                                      ArrayRef<int> Mask) {
  assert(LaneSizeInBits && ScalarSizeInBits &&
         (LaneSizeInBits % ScalarSizeInBits) == 0 &&
         "Illegal shuffle lane size");
  int LaneSize = LaneSizeInBits / ScalarSizeInBits;
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}

/// Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
  return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
}
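
// Example: for v8f32 with 128-bit lanes (LaneSize == 4), the mask
// <0, 1, 2, 3, 4, 5, 6, 7> stays in-lane, while <4, 5, 6, 7, 0, 1, 2, 3>
// moves elements between the two 128-bit halves and so is lane-crossing.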

/// Test whether elements in each LaneSizeInBits lane in this shuffle mask come
/// from multiple lanes - this is different to isLaneCrossingShuffleMask to
/// better support 'repeated mask + lane permute' style shuffles.
static bool isMultiLaneShuffleMask(unsigned LaneSizeInBits,
                                   unsigned ScalarSizeInBits,
                                   ArrayRef<int> Mask) {
  assert(LaneSizeInBits && ScalarSizeInBits &&
         (LaneSizeInBits % ScalarSizeInBits) == 0 &&
         "Illegal shuffle lane size");
  int NumElts = Mask.size();
  int NumEltsPerLane = LaneSizeInBits / ScalarSizeInBits;
  int NumLanes = NumElts / NumEltsPerLane;
  if (NumLanes > 1) {
    for (int i = 0; i != NumLanes; ++i) {
      int SrcLane = -1;
      for (int j = 0; j != NumEltsPerLane; ++j) {
        int M = Mask[(i * NumEltsPerLane) + j];
        if (M < 0)
          continue;
        int Lane = (M % NumElts) / NumEltsPerLane;
        if (SrcLane >= 0 && SrcLane != Lane)
          return true;
        SrcLane = Lane;
      }
    }
  }
  return false;
}

/// Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// lane-relative shuffle in each sub-lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
/// suitable for use with existing 128-bit shuffles as entries from the second
/// vector have been remapped to [LaneSize, 2*LaneSize).
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                  ArrayRef<int> Mask,
                                  SmallVectorImpl<int> &RepeatedMask) {
  auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, -1);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] < 0)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
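
// Worked example: for v8f32 with 128-bit lanes (LaneSize == 4), the two-input
// mask <0, 9, 2, 11, 4, 13, 6, 15> performs the same lane-relative blend in
// both lanes, so this returns true with RepeatedMask == <0, 5, 2, 7> (second
// vector entries remapped into [4, 8)).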

/// Test whether a shuffle mask is equivalent within each 128-bit lane.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}

static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask) {
  SmallVector<int, 32> RepeatedMask;
  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}

/// Test whether a shuffle mask is equivalent within each 256-bit lane.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}

/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits,
                                        unsigned EltSizeInBits,
                                        ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = LaneSizeInBits / EltSizeInBits;
  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
    if (Mask[i] == SM_SentinelUndef)
      continue;
    if (Mask[i] == SM_SentinelZero) {
      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
        return false;
      RepeatedMask[i % LaneSize] = SM_SentinelZero;
      continue;
    }
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Handle the in-lane shuffles by detecting if and when they repeat. Adjust
    // later vector indices to start at multiples of LaneSize instead of Size.
    int LaneM = Mask[i] / Size;
    int LocalM = (Mask[i] % LaneSize) + (LaneM * LaneSize);
    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}

/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                        ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedTargetShuffleMask(LaneSizeInBits, VT.getScalarSizeInBits(),
                                     Mask, RepeatedMask);
}

/// Checks whether the vector elements referenced by two shuffle masks are
/// equivalent.
static bool IsElementEquivalent(int MaskSize, SDValue Op, SDValue ExpectedOp,
                                int Idx, int ExpectedIdx) {
  assert(0 <= Idx && Idx < MaskSize && 0 <= ExpectedIdx &&
         ExpectedIdx < MaskSize && "Out of range element index");
  if (!Op || !ExpectedOp || Op.getOpcode() != ExpectedOp.getOpcode())
    return false;

  switch (Op.getOpcode()) {
  case ISD::BUILD_VECTOR:
    // If the values are build vectors, we can look through them to find
    // equivalent inputs that make the shuffles equivalent.
    // TODO: Handle MaskSize != Op.getNumOperands()?
    if (MaskSize == (int)Op.getNumOperands() &&
        MaskSize == (int)ExpectedOp.getNumOperands())
      return Op.getOperand(Idx) == ExpectedOp.getOperand(ExpectedIdx);
    break;
  case X86ISD::VBROADCAST:
  case X86ISD::VBROADCAST_LOAD:
    // TODO: Handle MaskSize != Op.getValueType().getVectorNumElements()?
    return (Op == ExpectedOp &&
            (int)Op.getValueType().getVectorNumElements() == MaskSize);
  case X86ISD::HADD:
  case X86ISD::HSUB:
  case X86ISD::FHADD:
  case X86ISD::FHSUB:
  case X86ISD::PACKSS:
  case X86ISD::PACKUS:
    // HOP(X,X) can refer to the elt from the lower/upper half of a lane.
    // TODO: Handle MaskSize != NumElts?
    // TODO: Handle HOP(X,Y) vs HOP(Y,X) equivalence cases.
    if (Op == ExpectedOp && Op.getOperand(0) == Op.getOperand(1)) {
      MVT VT = Op.getSimpleValueType();
      int NumElts = VT.getVectorNumElements();
      if (MaskSize == NumElts) {
        int NumLanes = VT.getSizeInBits() / 128;
        int NumEltsPerLane = NumElts / NumLanes;
        int NumHalfEltsPerLane = NumEltsPerLane / 2;
        bool SameLane =
            (Idx / NumEltsPerLane) == (ExpectedIdx / NumEltsPerLane);
        bool SameElt =
            (Idx % NumHalfEltsPerLane) == (ExpectedIdx % NumHalfEltsPerLane);
        return SameLane && SameElt;
      }
    }
    break;
  }

  return false;
}
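
// Example of the HOP(X,X) case above: v4i32 HADD(X,X) computes
// <x0+x1, x2+x3, x0+x1, x2+x3>, so elements 0 and 2 (likewise 1 and 3) are
// interchangeable: they share a 128-bit lane and have the same index modulo
// NumHalfEltsPerLane == 2.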

/// Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
static bool isShuffleEquivalent(ArrayRef<int> Mask, ArrayRef<int> ExpectedMask,
                                SDValue V1 = SDValue(),
                                SDValue V2 = SDValue()) {
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
    return false;

  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    int MaskIdx = Mask[i];
    int ExpectedIdx = ExpectedMask[i];
    if (0 <= MaskIdx && MaskIdx != ExpectedIdx) {
      SDValue MaskV = MaskIdx < Size ? V1 : V2;
      SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
      MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
      ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
      if (!IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
        return false;
    }
  }
  return true;
}

/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// The masks must be exactly the same width.
///
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both, or via a known bits test.
static bool isTargetShuffleEquivalent(MVT VT, ArrayRef<int> Mask,
                                      ArrayRef<int> ExpectedMask,
                                      const SelectionDAG &DAG,
                                      SDValue V1 = SDValue(),
                                      SDValue V2 = SDValue()) {
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
    return false;
  assert(llvm::all_of(ExpectedMask,
                      [Size](int M) { return isInRange(M, 0, 2 * Size); }) &&
         "Illegal target shuffle mask");

  // Check for out-of-range target shuffle mask indices.
  if (!isUndefOrZeroOrInRange(Mask, 0, 2 * Size))
    return false;

  // Don't use V1/V2 if they're not the same size as the shuffle mask type.
  if (V1 && V1.getValueSizeInBits() != VT.getSizeInBits())
    V1 = SDValue();
  if (V2 && V2.getValueSizeInBits() != VT.getSizeInBits())
    V2 = SDValue();

  APInt ZeroV1 = APInt::getZero(Size);
  APInt ZeroV2 = APInt::getZero(Size);

  for (int i = 0; i < Size; ++i) {
    int MaskIdx = Mask[i];
    int ExpectedIdx = ExpectedMask[i];
    if (MaskIdx == SM_SentinelUndef || MaskIdx == ExpectedIdx)
      continue;
    if (MaskIdx == SM_SentinelZero) {
      // If we need this expected index to be a zero element, then update the
      // relevant zero mask and perform the known bits test at the end to
      // minimize repeated computes.
      SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
      if (ExpectedV &&
          Size == (int)ExpectedV.getValueType().getVectorNumElements()) {
        int BitIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
        APInt &ZeroMask = ExpectedIdx < Size ? ZeroV1 : ZeroV2;
        ZeroMask.setBit(BitIdx);
        continue;
      }
    }
    if (MaskIdx >= 0) {
      SDValue MaskV = MaskIdx < Size ? V1 : V2;
      SDValue ExpectedV = ExpectedIdx < Size ? V1 : V2;
      MaskIdx = MaskIdx < Size ? MaskIdx : (MaskIdx - Size);
      ExpectedIdx = ExpectedIdx < Size ? ExpectedIdx : (ExpectedIdx - Size);
      if (IsElementEquivalent(Size, MaskV, ExpectedV, MaskIdx, ExpectedIdx))
        continue;
    }
    return false;
  }
  return (ZeroV1.isZero() || DAG.MaskedVectorIsZero(V1, ZeroV1)) &&
         (ZeroV2.isZero() || DAG.MaskedVectorIsZero(V2, ZeroV2));
}

// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions.
static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT,
                                  const SelectionDAG &DAG) {
  if (VT != MVT::v8i32 && VT != MVT::v8f32)
    return false;

  SmallVector<int, 8> Unpcklwd;
  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
                          /* Unary = */ false);
  SmallVector<int, 8> Unpckhwd;
  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
                          /* Unary = */ false);
  bool IsUnpackwdMask = (isTargetShuffleEquivalent(VT, Mask, Unpcklwd, DAG) ||
                         isTargetShuffleEquivalent(VT, Mask, Unpckhwd, DAG));
  return IsUnpackwdMask;
}

static bool is128BitUnpackShuffleMask(ArrayRef<int> Mask,
                                      const SelectionDAG &DAG) {
  // Create 128-bit vector type based on mask size.
  MVT EltVT = MVT::getIntegerVT(128 / Mask.size());
  MVT VT = MVT::getVectorVT(EltVT, Mask.size());

  // We can't assume a canonical shuffle mask, so try the commuted version too.
  SmallVector<int, 4> CommutedMask(Mask);
  ShuffleVectorSDNode::commuteMask(CommutedMask);

  // Match any of unary/binary or low/high.
  for (unsigned i = 0; i != 4; ++i) {
    SmallVector<int, 16> UnpackMask;
    createUnpackShuffleMask(VT, UnpackMask, (i >> 1) % 2, i % 2);
    if (isTargetShuffleEquivalent(VT, Mask, UnpackMask, DAG) ||
        isTargetShuffleEquivalent(VT, CommutedMask, UnpackMask, DAG))
      return true;
  }
  return false;
}

/// Return true if a shuffle mask chooses elements identically in its top and
/// bottom halves. For example, any splat mask has the same top and bottom
/// halves. If an element is undefined in only one half of the mask, the halves
/// are not considered identical.
static bool hasIdenticalHalvesShuffleMask(ArrayRef<int> Mask) {
  assert(Mask.size() % 2 == 0 && "Expecting even number of elements in mask");
  unsigned HalfSize = Mask.size() / 2;
  for (unsigned i = 0; i != HalfSize; ++i) {
    if (Mask[i] != Mask[i + HalfSize])
      return false;
  }
  return true;
}

/// Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
/// example.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");

  // If the mask only uses one non-undef element, then fully 'splat' it to
  // improve later broadcast matching.
  int FirstIndex = find_if(Mask, [](int M) { return M >= 0; }) - Mask.begin();
  assert(0 <= FirstIndex && FirstIndex < 4 && "All undef shuffle mask");

  int FirstElt = Mask[FirstIndex];
  if (all_of(Mask, [FirstElt](int M) { return M < 0 || M == FirstElt; }))
    return (FirstElt << 6) | (FirstElt << 4) | (FirstElt << 2) | FirstElt;

  unsigned Imm = 0;
  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
  return Imm;
}
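
// Worked encoding example: the mask <1, 0, 3, 2> packs two bits per element,
// lowest element first: Imm = 1 | (0 << 2) | (3 << 4) | (2 << 6) == 0xB1,
// the immediate used by e.g. 'pshufd $0xb1, %xmm0, %xmm0' to swap adjacent
// element pairs.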

static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
                                          SelectionDAG &DAG) {
  return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}

// The shuffle result has the form:
//   0* a[0] 0* a[1] ... 0* a[n], n >= 0 (i.e. runs of zeros interleaved with
// the a[] elements in ascending order).
// Each Zeroable element corresponds to a particular Mask element, as
// described in the computeZeroableShuffleElements function.
//
// The function looks for a sub-mask whose nonzero elements are in
// increasing order. If such a sub-mask exists, the function returns true.
static bool isNonZeroElementsInOrder(const APInt &Zeroable,
                                     ArrayRef<int> Mask, const EVT &VectorType,
                                     bool &IsZeroSideLeft) {
  int NextElement = -1;
  // Check if the Mask's nonzero elements are in increasing order.
  for (int i = 0, e = Mask.size(); i < e; i++) {
    // Check that the mask's zeroed elements are built from only zeros.
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] < 0)
      return false;
    if (Zeroable[i])
      continue;
    // Find the lowest non-zero element.
    if (NextElement < 0) {
      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
      IsZeroSideLeft = NextElement != 0;
    }
    // Exit if the mask's non-zero elements are not in increasing order.
    if (NextElement != Mask[i])
      return false;
    NextElement++;
  }
  return true;
}
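
// Example: with the first two result elements known to be zero (Zeroable ==
// 0b0011), the v4i32 mask <0, 1, 4, 5> has its non-zero elements 4 and 5 in
// increasing source order, so this returns true with IsZeroSideLeft == true,
// exactly the shape lowerShuffleToEXPAND below can lower to VEXPAND.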

/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue lowerShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                      ArrayRef<int> Mask, SDValue V1,
                                      SDValue V2, const APInt &Zeroable,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  const int NumBytes = VT.getSizeInBits() / 8;
  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
         (Subtarget.hasBWI() && VT.is512BitVector()));

  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
  // Sign bit set in i8 mask means zero element.
  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

  SDValue V;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / NumEltBytes];
    if (M < 0) {
      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
      continue;
    }
    if (Zeroable[i / NumEltBytes]) {
      PSHUFBMask[i] = ZeroMask;
      continue;
    }

    // We can only use a single input of V1 or V2.
    SDValue SrcV = (M >= Size ? V2 : V1);
    if (V && V != SrcV)
      return SDValue();
    V = SrcV;
    M %= Size;

    // PSHUFB can't cross lanes, ensure this doesn't happen.
    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
      return SDValue();

    M = M % LaneSize;
    M = M * NumEltBytes + (i % NumEltBytes);
    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
  }
  assert(V && "Failed to find a source input");

  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
  return DAG.getBitcast(
      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
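
// Byte-mask example for the loop above: a v8i16 shuffle whose first elements
// are <3, u, z, 0, ...> expands each mask entry into NumEltBytes == 2 byte
// selectors, producing PSHUFBMask = {6, 7, u, u, 0x80, 0x80, 0, 1, ...}:
// element 3 becomes bytes 6 and 7, undef stays undef, and zeroable entries
// get the 0x80 selector whose sign bit forces a zero byte.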

static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl);

// X86 has a dedicated shuffle that can be lowered to VEXPAND.
static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
                                    const APInt &Zeroable,
                                    ArrayRef<int> Mask, SDValue &V1,
                                    SDValue &V2, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  bool IsLeftZeroSide = true;
  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
                                IsLeftZeroSide))
    return SDValue();
  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
  MVT IntegerType =
      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
  unsigned NumElts = VT.getVectorNumElements();
  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
         "Unexpected number of vector elements");
  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
                              Subtarget, DAG, DL);
  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
  return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
}

static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
                                  unsigned &UnpackOpcode, bool IsUnary,
                                  ArrayRef<int> TargetMask, const SDLoc &DL,
                                  SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  int NumElts = VT.getVectorNumElements();

  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
  for (int i = 0; i != NumElts; i += 2) {
    int M1 = TargetMask[i + 0];
    int M2 = TargetMask[i + 1];
    Undef1 &= (SM_SentinelUndef == M1);
    Undef2 &= (SM_SentinelUndef == M2);
    Zero1 &= isUndefOrZero(M1);
    Zero2 &= isUndefOrZero(M2);
  }
  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
         "Zeroable shuffle detected");

  // Attempt to match the target mask against the unpack lo/hi mask patterns.
  SmallVector<int, 64> Unpckl, Unpckh;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
  if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG, V1,
                                (IsUnary ? V1 : V2))) {
    UnpackOpcode = X86ISD::UNPCKL;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
  if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG, V1,
                                (IsUnary ? V1 : V2))) {
    UnpackOpcode = X86ISD::UNPCKH;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
  if (IsUnary && (Zero1 || Zero2)) {
    // Don't bother if we can blend instead.
    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
      return false;

    bool MatchLo = true, MatchHi = true;
    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
      int M = TargetMask[i];

      // Ignore if the input is known to be zero or the index is undef.
      if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
          (M == SM_SentinelUndef))
        continue;

      MatchLo &= (M == Unpckl[i]);
      MatchHi &= (M == Unpckh[i]);
    }

    if (MatchLo || MatchHi) {
      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      return true;
    }
  }

  // If a binary shuffle, commute and try again.
  if (!IsUnary) {
    ShuffleVectorSDNode::commuteMask(Unpckl);
    if (isTargetShuffleEquivalent(VT, TargetMask, Unpckl, DAG)) {
      UnpackOpcode = X86ISD::UNPCKL;
      std::swap(V1, V2);
      return true;
    }

    ShuffleVectorSDNode::commuteMask(Unpckh);
    if (isTargetShuffleEquivalent(VT, TargetMask, Unpckh, DAG)) {
      UnpackOpcode = X86ISD::UNPCKH;
      std::swap(V1, V2);
      return true;
    }
  }

  return false;
}

// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
                                     ArrayRef<int> Mask, SDValue V1, SDValue V2,
                                     SelectionDAG &DAG) {
  SmallVector<int, 8> Unpckl;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
  if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

  SmallVector<int, 8> Unpckh;
  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
  if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

  // Commute and try again.
  ShuffleVectorSDNode::commuteMask(Unpckl);
  if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

  ShuffleVectorSDNode::commuteMask(Unpckh);
  if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

  return SDValue();
}

/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
/// followed by unpack 256-bit.
static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
                                        ArrayRef<int> Mask, SDValue V1,
                                        SDValue V2, SelectionDAG &DAG) {
  SmallVector<int, 32> Unpckl, Unpckh;
  createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
  createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);

  unsigned UnpackOpcode;
  if (isShuffleEquivalent(Mask, Unpckl, V1, V2))
    UnpackOpcode = X86ISD::UNPCKL;
  else if (isShuffleEquivalent(Mask, Unpckh, V1, V2))
    UnpackOpcode = X86ISD::UNPCKH;
  else
    return SDValue();

  // This is a "natural" unpack operation (rather than the 128-bit sectored
  // operation implemented by AVX). We need to rearrange 64-bit chunks of the
  // input in order to use the x86 instruction.
  V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
                            DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
  V1 = DAG.getBitcast(VT, V1);
  return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
}
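
// Note on the pre-shuffle above: AVX unpacks operate within each 128-bit
// lane, so a whole-vector splat2 mask such as v8i32 <0, 0, 1, 1, 2, 2, 3, 3>
// first needs its 64-bit chunks reordered as {0, 2, 1, 3}; each lane of the
// following in-lane UNPCKL/UNPCKH then sees exactly the chunks it must
// interleave.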

// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
// source into the lower elements and zeroing the upper elements.
static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
                                 ArrayRef<int> Mask, const APInt &Zeroable,
                                 const X86Subtarget &Subtarget) {
  if (!VT.is512BitVector() && !Subtarget.hasVLX())
    return false;

  unsigned NumElts = Mask.size();
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  unsigned MaxScale = 64 / EltSizeInBits;

  for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
    unsigned SrcEltBits = EltSizeInBits * Scale;
    if (SrcEltBits < 32 && !Subtarget.hasBWI())
      continue;
    unsigned NumSrcElts = NumElts / Scale;
    if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
      continue;
    unsigned UpperElts = NumElts - NumSrcElts;
    if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes())
      continue;
    SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
    SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
    DstVT = MVT::getIntegerVT(EltSizeInBits);
    if ((NumSrcElts * EltSizeInBits) >= 128) {
      // ISD::TRUNCATE
      DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
    } else {
      // X86ISD::VTRUNC
      DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
    }
    return true;
  }

  return false;
}
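
// Example match: the v8i16 mask <0, 2, 4, 6, z, z, z, z> (upper half all
// zeroable) matches at Scale == 2, since the kept elements step through the
// source with stride 2. This yields SrcVT == v4i32 and DstVT == v8i16; as
// NumSrcElts * EltSizeInBits == 64 < 128, the X86ISD::VTRUNC form is chosen.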
| 12554 | ||||
| 12555 | // Helper to create TRUNCATE/VTRUNC nodes, optionally with zero/undef upper | |||
| 12556 | // element padding to the final DstVT. | |||
| 12557 | static SDValue getAVX512TruncNode(const SDLoc &DL, MVT DstVT, SDValue Src, | |||
| 12558 | const X86Subtarget &Subtarget, | |||
| 12559 | SelectionDAG &DAG, bool ZeroUppers) { | |||
| 12560 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 12561 | MVT DstSVT = DstVT.getScalarType(); | |||
| 12562 | unsigned NumDstElts = DstVT.getVectorNumElements(); | |||
| 12563 | unsigned NumSrcElts = SrcVT.getVectorNumElements(); | |||
| 12564 | unsigned DstEltSizeInBits = DstVT.getScalarSizeInBits(); | |||
| 12565 | ||||
| 12566 | if (!DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) | |||
| 12567 | return SDValue(); | |||
| 12568 | ||||
| 12569 | // Perform a direct ISD::TRUNCATE if possible. | |||
| 12570 | if (NumSrcElts == NumDstElts) | |||
| 12571 | return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Src); | |||
| 12572 | ||||
| 12573 | if (NumSrcElts > NumDstElts) { | |||
| 12574 | MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts); | |||
| 12575 | SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src); | |||
| 12576 | return extractSubVector(Trunc, 0, DAG, DL, DstVT.getSizeInBits()); | |||
| 12577 | } | |||
| 12578 | ||||
| 12579 | if ((NumSrcElts * DstEltSizeInBits) >= 128) { | |||
| 12580 | MVT TruncVT = MVT::getVectorVT(DstSVT, NumSrcElts); | |||
| 12581 | SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Src); | |||
| 12582 | return widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL, | |||
| 12583 | DstVT.getSizeInBits()); | |||
| 12584 | } | |||
| 12585 | ||||
| 12586 | // Non-VLX targets must truncate from a 512-bit type, so we need to | |||
| 12587 | // widen, truncate and then possibly extract the original subvector. | |||
| 12588 | if (!Subtarget.hasVLX() && !SrcVT.is512BitVector()) { | |||
| 12589 | SDValue NewSrc = widenSubVector(Src, ZeroUppers, Subtarget, DAG, DL, 512); | |||
| 12590 | return getAVX512TruncNode(DL, DstVT, NewSrc, Subtarget, DAG, ZeroUppers); | |||
| 12591 | } | |||
| 12592 | ||||
| 12593 | // Fallback to a X86ISD::VTRUNC, padding if necessary. | |||
| 12594 | MVT TruncVT = MVT::getVectorVT(DstSVT, 128 / DstEltSizeInBits); | |||
| 12595 | SDValue Trunc = DAG.getNode(X86ISD::VTRUNC, DL, TruncVT, Src); | |||
| 12596 | if (DstVT != TruncVT) | |||
| 12597 | Trunc = widenSubVector(Trunc, ZeroUppers, Subtarget, DAG, DL, | |||
| 12598 | DstVT.getSizeInBits()); | |||
| 12599 | return Trunc; | |||
| 12600 | } | |||
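| | // Illustrative trace of the non-VLX widening path (assumed operands): for | |||
| | // Src = v4i32 and DstVT = v16i8 with ZeroUppers set, NumSrcElts (4) is less | |||
| | // than NumDstElts (16) and 4 x 8 = 32 bits < 128, so Src is widened to a | |||
| | // 512-bit v16i32 and the recursive call hits the direct ISD::TRUNCATE case, | |||
| | // i.e. a vpmovdb from a zmm register. | |||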
| 12601 | ||||
| 12602 | // Try to lower trunc+vector_shuffle to a vpmovdb or a vpmovdw instruction. | |||
| 12603 | // | |||
| 12604 | // An example is the following: | |||
| 12605 | // | |||
| 12606 | // t0: ch = EntryToken | |||
| 12607 | // t2: v4i64,ch = CopyFromReg t0, Register:v4i64 %0 | |||
| 12608 | // t25: v4i32 = truncate t2 | |||
| 12609 | // t41: v8i16 = bitcast t25 | |||
| 12610 | // t21: v8i16 = BUILD_VECTOR undef:i16, undef:i16, undef:i16, undef:i16, | |||
| 12611 | // Constant:i16<0>, Constant:i16<0>, Constant:i16<0>, Constant:i16<0> | |||
| 12612 | // t51: v8i16 = vector_shuffle<0,2,4,6,12,13,14,15> t41, t21 | |||
| 12613 | // t18: v2i64 = bitcast t51 | |||
| 12614 | // | |||
| 12615 | // One can just use a single vpmovdw instruction; without avx512vl we need to | |||
| 12616 | // use the zmm variant and extract the lower subvector, padding with zeroes. | |||
| 12617 | // TODO: Merge with lowerShuffleAsVTRUNC. | |||
| 12618 | static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, MVT VT, SDValue V1, | |||
| 12619 | SDValue V2, ArrayRef<int> Mask, | |||
| 12620 | const APInt &Zeroable, | |||
| 12621 | const X86Subtarget &Subtarget, | |||
| 12622 | SelectionDAG &DAG) { | |||
| 12623 | assert((VT == MVT::v16i8 || VT == MVT::v8i16) && "Unexpected VTRUNC type"); | |||
| 12624 | if (!Subtarget.hasAVX512()) | |||
| 12625 | return SDValue(); | |||
| 12626 | ||||
| 12627 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 12628 | unsigned EltSizeInBits = VT.getScalarSizeInBits(); | |||
| 12629 | unsigned MaxScale = 64 / EltSizeInBits; | |||
| 12630 | for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) { | |||
| 12631 | unsigned SrcEltBits = EltSizeInBits * Scale; | |||
| 12632 | unsigned NumSrcElts = NumElts / Scale; | |||
| 12633 | unsigned UpperElts = NumElts - NumSrcElts; | |||
| 12634 | if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale) || | |||
| 12635 | !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes()) | |||
| 12636 | continue; | |||
| 12637 | ||||
| 12638 | // Attempt to find a matching source truncation, but as a fallback, VLX | |||
| 12639 | // cases can use the VPMOV directly. | |||
| 12640 | SDValue Src = peekThroughBitcasts(V1); | |||
| 12641 | if (Src.getOpcode() == ISD::TRUNCATE && | |||
| 12642 | Src.getScalarValueSizeInBits() == SrcEltBits) { | |||
| 12643 | Src = Src.getOperand(0); | |||
| 12644 | } else if (Subtarget.hasVLX()) { | |||
| 12645 | MVT SrcSVT = MVT::getIntegerVT(SrcEltBits); | |||
| 12646 | MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts); | |||
| 12647 | Src = DAG.getBitcast(SrcVT, Src); | |||
| 12648 | // Don't do this if PACKSS/PACKUS could perform it cheaper. | |||
| 12649 | if (Scale == 2 && | |||
| 12650 | ((DAG.ComputeNumSignBits(Src) > EltSizeInBits) || | |||
| 12651 | (DAG.computeKnownBits(Src).countMinLeadingZeros() >= EltSizeInBits))) | |||
| 12652 | return SDValue(); | |||
| 12653 | } else | |||
| 12654 | return SDValue(); | |||
| 12655 | ||||
| 12656 | // VPMOVWB is only available with avx512bw. | |||
| 12657 | if (!Subtarget.hasBWI() && Src.getScalarValueSizeInBits() < 32) | |||
| 12658 | return SDValue(); | |||
| 12659 | ||||
| 12660 | bool UndefUppers = isUndefInRange(Mask, NumSrcElts, UpperElts); | |||
| 12661 | return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers); | |||
| 12662 | } | |||
| 12663 | ||||
| 12664 | return SDValue(); | |||
| 12665 | } | |||
| 12666 | ||||
| 12667 | // Attempt to match binary shuffle patterns as a truncate. | |||
| 12668 | static SDValue lowerShuffleAsVTRUNC(const SDLoc &DL, MVT VT, SDValue V1, | |||
| 12669 | SDValue V2, ArrayRef<int> Mask, | |||
| 12670 | const APInt &Zeroable, | |||
| 12671 | const X86Subtarget &Subtarget, | |||
| 12672 | SelectionDAG &DAG) { | |||
| 12673 | assert((VT.is128BitVector() || VT.is256BitVector()) && | |||
| 12674 | "Unexpected VTRUNC type"); | |||
| 12675 | if (!Subtarget.hasAVX512()) | |||
| 12676 | return SDValue(); | |||
| 12677 | ||||
| 12678 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 12679 | unsigned EltSizeInBits = VT.getScalarSizeInBits(); | |||
| 12680 | unsigned MaxScale = 64 / EltSizeInBits; | |||
| 12681 | for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) { | |||
| 12682 | // TODO: Support non-BWI VPMOVWB truncations? | |||
| 12683 | unsigned SrcEltBits = EltSizeInBits * Scale; | |||
| 12684 | if (SrcEltBits < 32 && !Subtarget.hasBWI()) | |||
| 12685 | continue; | |||
| 12686 | ||||
| 12687 | // Match shuffle <Ofs,Ofs+Scale,Ofs+2*Scale,..,undef_or_zero,undef_or_zero> | |||
| 12688 | // Bail if the V2 elements are undef. | |||
| 12689 | unsigned NumHalfSrcElts = NumElts / Scale; | |||
| 12690 | unsigned NumSrcElts = 2 * NumHalfSrcElts; | |||
| 12691 | for (unsigned Offset = 0; Offset != Scale; ++Offset) { | |||
| 12692 | if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, Offset, Scale) || | |||
| 12693 | isUndefInRange(Mask, NumHalfSrcElts, NumHalfSrcElts)) | |||
| 12694 | continue; | |||
| 12695 | ||||
| 12696 | // The elements beyond the truncation must be undef/zero. | |||
| 12697 | unsigned UpperElts = NumElts - NumSrcElts; | |||
| 12698 | if (UpperElts > 0 && | |||
| 12699 | !Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnes()) | |||
| 12700 | continue; | |||
| 12701 | bool UndefUppers = | |||
| 12702 | UpperElts > 0 && isUndefInRange(Mask, NumSrcElts, UpperElts); | |||
| 12703 | ||||
| 12704 | // For offset truncations, ensure that the concat is cheap. | |||
| 12705 | if (Offset) { | |||
| 12706 | auto IsCheapConcat = [&](SDValue Lo, SDValue Hi) { | |||
| 12707 | if (Lo.getOpcode() == ISD::EXTRACT_SUBVECTOR && | |||
| 12708 | Hi.getOpcode() == ISD::EXTRACT_SUBVECTOR) | |||
| 12709 | return Lo.getOperand(0) == Hi.getOperand(0); | |||
| 12710 | if (ISD::isNormalLoad(Lo.getNode()) && | |||
| 12711 | ISD::isNormalLoad(Hi.getNode())) { | |||
| 12712 | auto *LDLo = cast<LoadSDNode>(Lo); | |||
| 12713 | auto *LDHi = cast<LoadSDNode>(Hi); | |||
| 12714 | return DAG.areNonVolatileConsecutiveLoads( | |||
| 12715 | LDHi, LDLo, Lo.getValueType().getStoreSize(), 1); | |||
| 12716 | } | |||
| 12717 | return false; | |||
| 12718 | }; | |||
| 12719 | if (!IsCheapConcat(V1, V2)) | |||
| 12720 | continue; | |||
| 12721 | } | |||
| 12722 | ||||
| 12723 | // As we're using both sources, we need to concat them together | |||
| 12724 | // and truncate from the double-sized src. | |||
| 12725 | MVT ConcatVT = MVT::getVectorVT(VT.getScalarType(), NumElts * 2); | |||
| 12726 | SDValue Src = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatVT, V1, V2); | |||
| 12727 | ||||
| 12728 | MVT SrcSVT = MVT::getIntegerVT(SrcEltBits); | |||
| 12729 | MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts); | |||
| 12730 | Src = DAG.getBitcast(SrcVT, Src); | |||
| 12731 | ||||
| 12732 | // Shift the offset'd elements into place for the truncation. | |||
| 12733 | // TODO: Use getTargetVShiftByConstNode. | |||
| 12734 | if (Offset) | |||
| 12735 | Src = DAG.getNode( | |||
| 12736 | X86ISD::VSRLI, DL, SrcVT, Src, | |||
| 12737 | DAG.getTargetConstant(Offset * EltSizeInBits, DL, MVT::i8)); | |||
| 12738 | ||||
| 12739 | return getAVX512TruncNode(DL, VT, Src, Subtarget, DAG, !UndefUppers); | |||
| 12740 | } | |||
| 12741 | } | |||
| 12742 | ||||
| 12743 | return SDValue(); | |||
| 12744 | } | |||
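| | // Worked example of an offset truncation (hypothetical inputs): for | |||
| | // VT = v8i16 and Mask = <1,3,5,7,9,11,13,15>, the Scale = 2 / Offset = 1 | |||
| | // iteration matches, provided V1/V2 form a cheap concat (e.g. two halves of | |||
| | // one wider vector). The concat is bitcast to v8i32, shifted right by | |||
| | // Offset * EltSizeInBits = 16 bits so the odd elements land in the low | |||
| | // halves, and then truncated back to v8i16. | |||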
| 12745 | ||||
| 12746 | /// Check whether a compaction lowering can be done by dropping even/odd | |||
| 12747 | /// elements and compute how many times even/odd elements must be dropped. | |||
| 12748 | /// | |||
| 12749 | /// This handles shuffles which take every Nth element where N is a power of | |||
| 12750 | /// two. Example shuffle masks: | |||
| 12751 | /// | |||
| 12752 | /// (even) | |||
| 12753 | /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14 | |||
| 12754 | /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30 | |||
| 12755 | /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12 | |||
| 12756 | /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28 | |||
| 12757 | /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8 | |||
| 12758 | /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24 | |||
| 12759 | /// | |||
| 12760 | /// (odd) | |||
| 12761 | /// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 0, 2, 4, 6, 8, 10, 12, 14 | |||
| 12762 | /// N = 1: 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31 | |||
| 12763 | /// | |||
| 12764 | /// Any of these lanes can of course be undef. | |||
| 12765 | /// | |||
| 12766 | /// This routine only supports N <= 3. | |||
| 12767 | /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here | |||
| 12768 | /// for larger N. | |||
| 12769 | /// | |||
| 12770 | /// \returns N above, or the number of times even/odd elements must be dropped | |||
| 12771 | /// if there is such a number. Otherwise returns zero. | |||
| 12772 | static int canLowerByDroppingElements(ArrayRef<int> Mask, bool MatchEven, | |||
| 12773 | bool IsSingleInput) { | |||
| 12774 | // The modulus for the shuffle vector entries is based on whether this is | |||
| 12775 | // a single input or not. | |||
| 12776 | int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2); | |||
| 12777 | assert(isPowerOf2_32((uint32_t)ShuffleModulus) && | |||
| 12778 | "We should only be called with masks with a power-of-2 size!"); | |||
| 12779 | ||||
| 12780 | uint64_t ModMask = (uint64_t)ShuffleModulus - 1; | |||
| 12781 | int Offset = MatchEven ? 0 : 1; | |||
| 12782 | ||||
| 12783 | // We track whether the input is viable for all power-of-2 strides 2^1, 2^2, | |||
| 12784 | // and 2^3 simultaneously. This is because we may have ambiguity with | |||
| 12785 | // partially undef inputs. | |||
| 12786 | bool ViableForN[3] = {true, true, true}; | |||
| 12787 | ||||
| 12788 | for (int i = 0, e = Mask.size(); i < e; ++i) { | |||
| 12789 | // Ignore undef lanes, we'll optimistically collapse them to the pattern we | |||
| 12790 | // want. | |||
| 12791 | if (Mask[i] < 0) | |||
| 12792 | continue; | |||
| 12793 | ||||
| 12794 | bool IsAnyViable = false; | |||
| 12795 | for (unsigned j = 0; j != std::size(ViableForN); ++j) | |||
| 12796 | if (ViableForN[j]) { | |||
| 12797 | uint64_t N = j + 1; | |||
| 12798 | ||||
| 12799 | // The shuffle mask must be equal to (i * 2^N) % M. | |||
| 12800 | if ((uint64_t)(Mask[i] - Offset) == (((uint64_t)i << N) & ModMask)) | |||
| 12801 | IsAnyViable = true; | |||
| 12802 | else | |||
| 12803 | ViableForN[j] = false; | |||
| 12804 | } | |||
| 12805 | // Early exit if we exhaust the possible powers of two. | |||
| 12806 | if (!IsAnyViable) | |||
| 12807 | break; | |||
| 12808 | } | |||
| 12809 | ||||
| 12810 | for (unsigned j = 0; j != std::size(ViableForN); ++j) | |||
| 12811 | if (ViableForN[j]) | |||
| 12812 | return j + 1; | |||
| 12813 | ||||
| 12814 | // Return 0 as there is no viable power of two. | |||
| 12815 | return 0; | |||
| 12816 | } | |||
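| | // Worked example (illustrative mask, MatchEven): for the single-input mask | |||
| | // <0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14>, ShuffleModulus = 16 and every | |||
| | // defined element satisfies Mask[i] == ((i << 1) & 15), so only N = 1 stays | |||
| | // viable and the function returns 1 (drop every other element once). | |||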
| 12817 | ||||
| 12818 | // X86 has dedicated pack instructions that can handle specific truncation | |||
| 12819 | // operations: PACKSS and PACKUS. | |||
| 12820 | // Checks for compaction shuffle masks if MaxStages > 1. | |||
| 12821 | // TODO: Add support for matching multiple PACKSS/PACKUS stages. | |||
| 12822 | static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2, | |||
| 12823 | unsigned &PackOpcode, ArrayRef<int> TargetMask, | |||
| 12824 | const SelectionDAG &DAG, | |||
| 12825 | const X86Subtarget &Subtarget, | |||
| 12826 | unsigned MaxStages = 1) { | |||
| 12827 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 12828 | unsigned BitSize = VT.getScalarSizeInBits(); | |||
| 12829 | assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 && | |||
| 12830 | "Illegal maximum compaction"); | |||
| 12831 | ||||
| 12832 | auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) { | |||
| 12833 | unsigned NumSrcBits = PackVT.getScalarSizeInBits(); | |||
| 12834 | unsigned NumPackedBits = NumSrcBits - BitSize; | |||
| 12835 | N1 = peekThroughBitcasts(N1); | |||
| 12836 | N2 = peekThroughBitcasts(N2); | |||
| 12837 | unsigned NumBits1 = N1.getScalarValueSizeInBits(); | |||
| 12838 | unsigned NumBits2 = N2.getScalarValueSizeInBits(); | |||
| 12839 | bool IsZero1 = llvm::isNullOrNullSplat(N1, /*AllowUndefs*/ false); | |||
| 12840 | bool IsZero2 = llvm::isNullOrNullSplat(N2, /*AllowUndefs*/ false); | |||
| 12841 | if ((!N1.isUndef() && !IsZero1 && NumBits1 != NumSrcBits) || | |||
| 12842 | (!N2.isUndef() && !IsZero2 && NumBits2 != NumSrcBits)) | |||
| 12843 | return false; | |||
| 12844 | if (Subtarget.hasSSE41() || BitSize == 8) { | |||
| 12845 | APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits); | |||
| 12846 | if ((N1.isUndef() || IsZero1 || DAG.MaskedValueIsZero(N1, ZeroMask)) && | |||
| 12847 | (N2.isUndef() || IsZero2 || DAG.MaskedValueIsZero(N2, ZeroMask))) { | |||
| 12848 | V1 = N1; | |||
| 12849 | V2 = N2; | |||
| 12850 | SrcVT = PackVT; | |||
| 12851 | PackOpcode = X86ISD::PACKUS; | |||
| 12852 | return true; | |||
| 12853 | } | |||
| 12854 | } | |||
| 12855 | bool IsAllOnes1 = llvm::isAllOnesOrAllOnesSplat(N1, /*AllowUndefs*/ false); | |||
| 12856 | bool IsAllOnes2 = llvm::isAllOnesOrAllOnesSplat(N2, /*AllowUndefs*/ false); | |||
| 12857 | if ((N1.isUndef() || IsZero1 || IsAllOnes1 || | |||
| 12858 | DAG.ComputeNumSignBits(N1) > NumPackedBits) && | |||
| 12859 | (N2.isUndef() || IsZero2 || IsAllOnes2 || | |||
| 12860 | DAG.ComputeNumSignBits(N2) > NumPackedBits)) { | |||
| 12861 | V1 = N1; | |||
| 12862 | V2 = N2; | |||
| 12863 | SrcVT = PackVT; | |||
| 12864 | PackOpcode = X86ISD::PACKSS; | |||
| 12865 | return true; | |||
| 12866 | } | |||
| 12867 | return false; | |||
| 12868 | }; | |||
| 12869 | ||||
| 12870 | // Attempt to match against wider and wider compaction patterns. | |||
| 12871 | for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) { | |||
| 12872 | MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages); | |||
| 12873 | MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages); | |||
| 12874 | ||||
| 12875 | // Try binary shuffle. | |||
| 12876 | SmallVector<int, 32> BinaryMask; | |||
| 12877 | createPackShuffleMask(VT, BinaryMask, false, NumStages); | |||
| 12878 | if (isTargetShuffleEquivalent(VT, TargetMask, BinaryMask, DAG, V1, V2)) | |||
| 12879 | if (MatchPACK(V1, V2, PackVT)) | |||
| 12880 | return true; | |||
| 12881 | ||||
| 12882 | // Try unary shuffle. | |||
| 12883 | SmallVector<int, 32> UnaryMask; | |||
| 12884 | createPackShuffleMask(VT, UnaryMask, true, NumStages); | |||
| 12885 | if (isTargetShuffleEquivalent(VT, TargetMask, UnaryMask, DAG, V1)) | |||
| 12886 | if (MatchPACK(V1, V1, PackVT)) | |||
| 12887 | return true; | |||
| 12888 | } | |||
| 12889 | ||||
| 12890 | return false; | |||
| 12891 | } | |||
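| | // Example of what MatchPACK checks (assumed values): matching a v16i8 | |||
| | // compaction mask <0,2,4,...,30> against PACKUSWB of two v8i16 sources | |||
| | // requires the high byte of every source word to be known zero | |||
| | // (MaskedValueIsZero), whereas the PACKSSWB form instead requires more sign | |||
| | // bits than packed bits, i.e. ComputeNumSignBits > 8 for a 16 -> 8 bit pack. | |||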
| 12892 | ||||
| 12893 | static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, | |||
| 12894 | SDValue V1, SDValue V2, SelectionDAG &DAG, | |||
| 12895 | const X86Subtarget &Subtarget) { | |||
| 12896 | MVT PackVT; | |||
| 12897 | unsigned PackOpcode; | |||
| 12898 | unsigned SizeBits = VT.getSizeInBits(); | |||
| 12899 | unsigned EltBits = VT.getScalarSizeInBits(); | |||
| 12900 | unsigned MaxStages = Log2_32(64 / EltBits); | |||
| 12901 | if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, | |||
| 12902 | Subtarget, MaxStages)) | |||
| 12903 | return SDValue(); | |||
| 12904 | ||||
| 12905 | unsigned CurrentEltBits = PackVT.getScalarSizeInBits(); | |||
| 12906 | unsigned NumStages = Log2_32(CurrentEltBits / EltBits); | |||
| 12907 | ||||
| 12908 | // Don't lower multi-stage packs on AVX512; truncation is better. | |||
| 12909 | if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX()) | |||
| 12910 | return SDValue(); | |||
| 12911 | ||||
| 12912 | // Pack to the largest type possible: | |||
| 12913 | // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB. | |||
| 12914 | unsigned MaxPackBits = 16; | |||
| 12915 | if (CurrentEltBits > 16 && | |||
| 12916 | (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41())) | |||
| 12917 | MaxPackBits = 32; | |||
| 12918 | ||||
| 12919 | // Repeatedly pack down to the target size. | |||
| 12920 | SDValue Res; | |||
| 12921 | for (unsigned i = 0; i != NumStages; ++i) { | |||
| 12922 | unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits); | |||
| 12923 | unsigned NumSrcElts = SizeBits / SrcEltBits; | |||
| 12924 | MVT SrcSVT = MVT::getIntegerVT(SrcEltBits); | |||
| 12925 | MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2); | |||
| 12926 | MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts); | |||
| 12927 | MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2); | |||
| 12928 | Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1), | |||
| 12929 | DAG.getBitcast(SrcVT, V2)); | |||
| 12930 | V1 = V2 = Res; | |||
| 12931 | CurrentEltBits /= 2; | |||
| 12932 | } | |||
| 12933 | assert(Res && Res.getValueType() == VT && | |||
| 12934 | "Failed to lower compaction shuffle"); | |||
| 12935 | return Res; | |||
| 12936 | } | |||
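| | // Illustrative two-stage lowering (hypothetical PACKSS match, no VLX): | |||
| | // packing v4i32 sources down to v16i8 gives PackVT = v4i32 and | |||
| | // NumStages = 2, so the loop emits PACKSSDW (v4i32 -> v8i16) followed by | |||
| | // PACKSSWB (v8i16 -> v16i8), feeding each stage's result back in as both | |||
| | // operands of the next. | |||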
| 12937 | ||||
| 12938 | /// Try to emit a bitmask instruction for a shuffle. | |||
| 12939 | /// | |||
| 12940 | /// This handles cases where we can model a blend exactly as a bitmask due to | |||
| 12941 | /// one of the inputs being zeroable. | |||
| 12942 | static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1, | |||
| 12943 | SDValue V2, ArrayRef<int> Mask, | |||
| 12944 | const APInt &Zeroable, | |||
| 12945 | const X86Subtarget &Subtarget, | |||
| 12946 | SelectionDAG &DAG) { | |||
| 12947 | MVT MaskVT = VT; | |||
| 12948 | MVT EltVT = VT.getVectorElementType(); | |||
| 12949 | SDValue Zero, AllOnes; | |||
| 12950 | // Use f64 if i64 isn't legal. | |||
| 12951 | if (EltVT == MVT::i64 && !Subtarget.is64Bit()) { | |||
| 12952 | EltVT = MVT::f64; | |||
| 12953 | MaskVT = MVT::getVectorVT(EltVT, Mask.size()); | |||
| 12954 | } | |||
| 12955 | ||||
| 12956 | MVT LogicVT = VT; | |||
| 12957 | if (EltVT == MVT::f32 || EltVT == MVT::f64) { | |||
| 12958 | Zero = DAG.getConstantFP(0.0, DL, EltVT); | |||
| 12959 | APFloat AllOnesValue = | |||
| 12960 | APFloat::getAllOnesValue(SelectionDAG::EVTToAPFloatSemantics(EltVT)); | |||
| 12961 | AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT); | |||
| 12962 | LogicVT = | |||
| 12963 | MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size()); | |||
| 12964 | } else { | |||
| 12965 | Zero = DAG.getConstant(0, DL, EltVT); | |||
| 12966 | AllOnes = DAG.getAllOnesConstant(DL, EltVT); | |||
| 12967 | } | |||
| 12968 | ||||
| 12969 | SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero); | |||
| 12970 | SDValue V; | |||
| 12971 | for (int i = 0, Size = Mask.size(); i < Size; ++i) { | |||
| 12972 | if (Zeroable[i]) | |||
| 12973 | continue; | |||
| 12974 | if (Mask[i] % Size != i) | |||
| 12975 | return SDValue(); // Not a blend. | |||
| 12976 | if (!V) | |||
| 12977 | V = Mask[i] < Size ? V1 : V2; | |||
| 12978 | else if (V != (Mask[i] < Size ? V1 : V2)) | |||
| 12979 | return SDValue(); // Can only let one input through the mask. | |||
| 12980 | ||||
| 12981 | VMaskOps[i] = AllOnes; | |||
| 12982 | } | |||
| 12983 | if (!V) | |||
| 12984 | return SDValue(); // No non-zeroable elements! | |||
| 12985 | ||||
| 12986 | SDValue VMask = DAG.getBuildVector(MaskVT, DL, VMaskOps); | |||
| 12987 | VMask = DAG.getBitcast(LogicVT, VMask); | |||
| 12988 | V = DAG.getBitcast(LogicVT, V); | |||
| 12989 | SDValue And = DAG.getNode(ISD::AND, DL, LogicVT, V, VMask); | |||
| 12990 | return DAG.getBitcast(VT, And); | |||
| 12991 | } | |||
| 12992 | ||||
| 12993 | /// Try to emit a blend instruction for a shuffle using bit math. | |||
| 12994 | /// | |||
| 12995 | /// This is used as a fallback approach when first class blend instructions are | |||
| 12996 | /// unavailable. Currently it is only suitable for integer vectors, but could | |||
| 12997 | /// be generalized for floating point vectors if desirable. | |||
| 12998 | static SDValue lowerShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1, | |||
| 12999 | SDValue V2, ArrayRef<int> Mask, | |||
| 13000 | SelectionDAG &DAG) { | |||
| 13001 | assert(VT.isInteger() && "Only supports integer vector types!"); | |||
| 13002 | MVT EltVT = VT.getVectorElementType(); | |||
| 13003 | SDValue Zero = DAG.getConstant(0, DL, EltVT); | |||
| 13004 | SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT); | |||
| 13005 | SmallVector<SDValue, 16> MaskOps; | |||
| 13006 | for (int i = 0, Size = Mask.size(); i < Size; ++i) { | |||
| 13007 | if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size) | |||
| 13008 | return SDValue(); // Shuffled input! | |||
| 13009 | MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero); | |||
| 13010 | } | |||
| 13011 | ||||
| 13012 | SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps); | |||
| 13013 | V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask); | |||
| 13014 | V2 = DAG.getNode(X86ISD::ANDNP, DL, VT, V1Mask, V2); | |||
| 13015 | return DAG.getNode(ISD::OR, DL, VT, V1, V2); | |||
| 13016 | } | |||
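| | // Minimal sketch of the bit math (illustrative mask): for VT = v4i32 and | |||
| | // Mask = <0,5,2,7>, V1Mask becomes <-1,0,-1,0> and the result is computed | |||
| | // as (V1 & V1Mask) | (~V1Mask & V2), with the ANDNP node providing the | |||
| | // ~V1Mask & V2 half. | |||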
| 13017 | ||||
| 13018 | static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, | |||
| 13019 | SDValue PreservedSrc, | |||
| 13020 | const X86Subtarget &Subtarget, | |||
| 13021 | SelectionDAG &DAG); | |||
| 13022 | ||||
| 13023 | static bool matchShuffleAsBlend(MVT VT, SDValue V1, SDValue V2, | |||
| 13024 | MutableArrayRef<int> Mask, | |||
| 13025 | const APInt &Zeroable, bool &ForceV1Zero, | |||
| 13026 | bool &ForceV2Zero, uint64_t &BlendMask) { | |||
| 13027 | bool V1IsZeroOrUndef = | |||
| 13028 | V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode()); | |||
| 13029 | bool V2IsZeroOrUndef = | |||
| 13030 | V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode()); | |||
| 13031 | ||||
| 13032 | BlendMask = 0; | |||
| 13033 | ForceV1Zero = false, ForceV2Zero = false; | |||
| 13034 | assert(Mask.size() <= 64 && "Shuffle mask too big for blend mask"); | |||
| 13035 | ||||
| 13036 | int NumElts = Mask.size(); | |||
| 13037 | int NumLanes = VT.getSizeInBits() / 128; | |||
| 13038 | int NumEltsPerLane = NumElts / NumLanes; | |||
| 13039 | assert((NumLanes * NumEltsPerLane) == NumElts && "Value type mismatch"); | |||
| 13040 | ||||
| 13041 | // For 32/64-bit elements, if we only reference one input (plus any undefs), | |||
| 13042 | // then ensure the blend mask part for that lane just references that input. | |||
| 13043 | bool ForceWholeLaneMasks = | |||
| 13044 | VT.is256BitVector() && VT.getScalarSizeInBits() >= 32; | |||
| 13045 | ||||
| 13046 | // Attempt to generate the binary blend mask. If an input is zero then | |||
| 13047 | // we can use any lane. | |||
| 13048 | for (int Lane = 0; Lane != NumLanes; ++Lane) { | |||
| 13049 | // Keep track of the inputs used per lane. | |||
| 13050 | bool LaneV1InUse = false; | |||
| 13051 | bool LaneV2InUse = false; | |||
| 13052 | uint64_t LaneBlendMask = 0; | |||
| 13053 | for (int LaneElt = 0; LaneElt != NumEltsPerLane; ++LaneElt) { | |||
| 13054 | int Elt = (Lane * NumEltsPerLane) + LaneElt; | |||
| 13055 | int M = Mask[Elt]; | |||
| 13056 | if (M == SM_SentinelUndef) | |||
| 13057 | continue; | |||
| 13058 | if (M == Elt || (0 <= M && M < NumElts && | |||
| 13059 | IsElementEquivalent(NumElts, V1, V1, M, Elt))) { | |||
| 13060 | Mask[Elt] = Elt; | |||
| 13061 | LaneV1InUse = true; | |||
| 13062 | continue; | |||
| 13063 | } | |||
| 13064 | if (M == (Elt + NumElts) || | |||
| 13065 | (NumElts <= M && | |||
| 13066 | IsElementEquivalent(NumElts, V2, V2, M - NumElts, Elt))) { | |||
| 13067 | LaneBlendMask |= 1ull << LaneElt; | |||
| 13068 | Mask[Elt] = Elt + NumElts; | |||
| 13069 | LaneV2InUse = true; | |||
| 13070 | continue; | |||
| 13071 | } | |||
| 13072 | if (Zeroable[Elt]) { | |||
| 13073 | if (V1IsZeroOrUndef) { | |||
| 13074 | ForceV1Zero = true; | |||
| 13075 | Mask[Elt] = Elt; | |||
| 13076 | LaneV1InUse = true; | |||
| 13077 | continue; | |||
| 13078 | } | |||
| 13079 | if (V2IsZeroOrUndef) { | |||
| 13080 | ForceV2Zero = true; | |||
| 13081 | LaneBlendMask |= 1ull << LaneElt; | |||
| 13082 | Mask[Elt] = Elt + NumElts; | |||
| 13083 | LaneV2InUse = true; | |||
| 13084 | continue; | |||
| 13085 | } | |||
| 13086 | } | |||
| 13087 | return false; | |||
| 13088 | } | |||
| 13089 | ||||
| 13090 | // If we only used V2 then splat the lane blend mask to avoid any demanded | |||
| 13091 | // elts from V1 in this lane (the V1 equivalent is implicit with a zero | |||
| 13092 | // blend mask bit). | |||
| 13093 | if (ForceWholeLaneMasks && LaneV2InUse && !LaneV1InUse) | |||
| 13094 | LaneBlendMask = (1ull << NumEltsPerLane) - 1; | |||
| 13095 | ||||
| 13096 | BlendMask |= LaneBlendMask << (Lane * NumEltsPerLane); | |||
| 13097 | } | |||
| 13098 | return true; | |||
| 13099 | } | |||
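| | // Worked example (assumed inputs): for VT = v8i32 and | |||
| | // Mask = <0,9,2,11,4,13,6,15>, each 128-bit lane takes elements 1 and 3 | |||
| | // from V2, so LaneBlendMask = 0b1010 in both lanes and the combined | |||
| | // BlendMask = 0xAA. Both inputs are used in each lane, so the whole-lane | |||
| | // widening at the end of the loop does not fire. | |||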
| 13100 | ||||
| 13101 | static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, | |||
| 13102 | int Scale) { | |||
| 13103 | uint64_t ScaledMask = 0; | |||
| 13104 | for (int i = 0; i != Size; ++i) | |||
| 13105 | if (BlendMask & (1ull << i)) | |||
| 13106 | ScaledMask |= ((1ull << Scale) - 1) << (i * Scale); | |||
| 13107 | return ScaledMask; | |||
| 13108 | } | |||
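| | // Example: scaleVectorShuffleBlendMask(0b0101, /*Size=*/4, /*Scale=*/2) | |||
| | // expands each set bit into Scale adjacent bits, yielding 0b00110011. | |||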
| 13109 | ||||
| 13110 | /// Try to emit a blend instruction for a shuffle. | |||
| 13111 | /// | |||
| 13112 | /// This doesn't do any checks for the availability of instructions for blending | |||
| 13113 | /// these values. It relies on the availability of the X86ISD::BLENDI pattern to | |||
| 13114 | /// be matched in the backend with the type given. What it does check for is | |||
| 13115 | /// that the shuffle mask is a blend, or convertible into a blend with zero. | |||
| 13116 | static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, | |||
| 13117 | SDValue V2, ArrayRef<int> Original, | |||
| 13118 | const APInt &Zeroable, | |||
| 13119 | const X86Subtarget &Subtarget, | |||
| 13120 | SelectionDAG &DAG) { | |||
| 13121 | uint64_t BlendMask = 0; | |||
| 13122 | bool ForceV1Zero = false, ForceV2Zero = false; | |||
| 13123 | SmallVector<int, 64> Mask(Original); | |||
| 13124 | if (!matchShuffleAsBlend(VT, V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero, | |||
| 13125 | BlendMask)) | |||
| 13126 | return SDValue(); | |||
| 13127 | ||||
| 13128 | // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. | |||
| 13129 | if (ForceV1Zero) | |||
| 13130 | V1 = getZeroVector(VT, Subtarget, DAG, DL); | |||
| 13131 | if (ForceV2Zero) | |||
| 13132 | V2 = getZeroVector(VT, Subtarget, DAG, DL); | |||
| 13133 | ||||
| 13134 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 13135 | ||||
| 13136 | switch (VT.SimpleTy) { | |||
| 13137 | case MVT::v4i64: | |||
| 13138 | case MVT::v8i32: | |||
| 13139 | assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!"); | |||
| 13140 | [[fallthrough]]; | |||
| 13141 | case MVT::v4f64: | |||
| 13142 | case MVT::v8f32: | |||
| 13143 | assert(Subtarget.hasAVX() && "256-bit float blends require AVX!"); | |||
| 13144 | [[fallthrough]]; | |||
| 13145 | case MVT::v2f64: | |||
| 13146 | case MVT::v2i64: | |||
| 13147 | case MVT::v4f32: | |||
| 13148 | case MVT::v4i32: | |||
| 13149 | case MVT::v8i16: | |||
| 13150 | assert(Subtarget.hasSSE41() && "128-bit blends require SSE41!"); | |||
| 13151 | return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2, | |||
| 13152 | DAG.getTargetConstant(BlendMask, DL, MVT::i8)); | |||
| 13153 | case MVT::v16i16: { | |||
| 13154 | assert(Subtarget.hasAVX2() && "v16i16 blends require AVX2!"); | |||
| 13155 | SmallVector<int, 8> RepeatedMask; | |||
| 13156 | if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { | |||
| 13157 | // We can lower these with PBLENDW, which is mirrored across 128-bit lanes. | |||
| 13158 | assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!"); | |||
| 13159 | BlendMask = 0; | |||
| 13160 | for (int i = 0; i < 8; ++i) | |||
| 13161 | if (RepeatedMask[i] >= 8) | |||
| 13162 | BlendMask |= 1ull << i; | |||
| 13163 | return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, | |||
| 13164 | DAG.getTargetConstant(BlendMask, DL, MVT::i8)); | |||
| 13165 | } | |||
| 13166 | // Use PBLENDW for lower/upper lanes and then blend lanes. | |||
| 13167 | // TODO - we should allow 2 PBLENDW here and leave shuffle combine to | |||
| 13168 | // merge to VSELECT where useful. | |||
| 13169 | uint64_t LoMask = BlendMask & 0xFF; | |||
| 13170 | uint64_t HiMask = (BlendMask >> 8) & 0xFF; | |||
| 13171 | if (LoMask == 0 || LoMask == 255 || HiMask == 0 || HiMask == 255) { | |||
| 13172 | SDValue Lo = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, | |||
| 13173 | DAG.getTargetConstant(LoMask, DL, MVT::i8)); | |||
| 13174 | SDValue Hi = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2, | |||
| 13175 | DAG.getTargetConstant(HiMask, DL, MVT::i8)); | |||
| 13176 | return DAG.getVectorShuffle( | |||
| 13177 | MVT::v16i16, DL, Lo, Hi, | |||
| 13178 | {0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31}); | |||
| 13179 | } | |||
| 13180 | [[fallthrough]]; | |||
| 13181 | } | |||
| 13182 | case MVT::v32i8: | |||
| 13183 | assert(Subtarget.hasAVX2() && "256-bit byte-blends require AVX2!"); | |||
| 13184 | [[fallthrough]]; | |||
| 13185 | case MVT::v16i8: { | |||
| 13186 | assert(Subtarget.hasSSE41() && "128-bit byte-blends require SSE41!"); | |||
| 13187 | ||||
| 13188 | // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB. | |||
| 13189 | if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, | |||
| 13190 | Subtarget, DAG)) | |||
| 13191 | return Masked; | |||
| 13192 | ||||
| 13193 | if (Subtarget.hasBWI() && Subtarget.hasVLX()) { | |||
| 13194 | MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8)); | |||
| 13195 | SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); | |||
| 13196 | return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); | |||
| 13197 | } | |||
| 13198 | ||||
| 13199 | // If we have VPTERNLOG, we can use that as a bit blend. | |||
| 13200 | if (Subtarget.hasVLX()) | |||
| 13201 | if (SDValue BitBlend = | |||
| 13202 | lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) | |||
| 13203 | return BitBlend; | |||
| 13204 | ||||
| 13205 | // Scale the blend by the number of bytes per element. | |||
| 13206 | int Scale = VT.getScalarSizeInBits() / 8; | |||
| 13207 | ||||
| 13208 | // This form of blend is always done on bytes. Compute the byte vector | |||
| 13209 | // type. | |||
| 13210 | MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); | |||
| 13211 | ||||
| 13212 | // x86 allows load folding with blendvb from the 2nd source operand. But | |||
| 13213 | // we are still using LLVM select here (see comment below), so that's V1. | |||
| 13214 | // If V2 can be load-folded and V1 cannot be load-folded, then commute to | |||
| 13215 | // allow that load-folding possibility. | |||
| 13216 | if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) { | |||
| 13217 | ShuffleVectorSDNode::commuteMask(Mask); | |||
| 13218 | std::swap(V1, V2); | |||
| 13219 | } | |||
| 13220 | ||||
| 13221 | // Compute the VSELECT mask. Note that VSELECT is really confusing in the | |||
| 13222 | // mix of LLVM's code generator and the x86 backend. We tell the code | |||
| 13223 | // generator that boolean values in the elements of an x86 vector register | |||
| 13224 | // are -1 for true and 0 for false. We then use the LLVM semantics of 'true' | |||
| 13225 | // mapping a select to operand #1, and 'false' mapping to operand #2. The | |||
| 13226 | // reality in x86 is that vector masks (pre-AVX-512) use only the high bit | |||
| 13227 | // of the element (the remaining are ignored) and 0 in that high bit would | |||
| 13228 | // mean operand #1 while 1 in the high bit would mean operand #2. So while | |||
| 13229 | // the LLVM model for boolean values in vector elements gets the relevant | |||
| 13230 | // bit set, it is set backwards and over constrained relative to x86's | |||
| 13231 | // actual model. | |||
| 13232 | SmallVector<SDValue, 32> VSELECTMask; | |||
| 13233 | for (int i = 0, Size = Mask.size(); i < Size; ++i) | |||
| 13234 | for (int j = 0; j < Scale; ++j) | |||
| 13235 | VSELECTMask.push_back( | |||
| 13236 | Mask[i] < 0 ? DAG.getUNDEF(MVT::i8) | |||
| 13237 | : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL, | |||
| 13238 | MVT::i8)); | |||
| 13239 | ||||
| 13240 | V1 = DAG.getBitcast(BlendVT, V1); | |||
| 13241 | V2 = DAG.getBitcast(BlendVT, V2); | |||
| 13242 | return DAG.getBitcast( | |||
| 13243 | VT, | |||
| 13244 | DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask), | |||
| 13245 | V1, V2)); | |||
| 13246 | } | |||
| 13247 | case MVT::v16f32: | |||
| 13248 | case MVT::v8f64: | |||
| 13249 | case MVT::v8i64: | |||
| 13250 | case MVT::v16i32: | |||
| 13251 | case MVT::v32i16: | |||
| 13252 | case MVT::v64i8: { | |||
| 13253 | // Attempt to lower to a bitmask if we can. Only if not optimizing for size. | |||
| 13254 | bool OptForSize = DAG.shouldOptForSize(); | |||
| 13255 | if (!OptForSize) { | |||
| 13256 | if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, | |||
| 13257 | Subtarget, DAG)) | |||
| 13258 | return Masked; | |||
| 13259 | } | |||
| 13260 | ||||
| 13261 | // Otherwise load an immediate into a GPR, cast to k-register, and use a | |||
| 13262 | // masked move. | |||
| 13263 | MVT IntegerType = MVT::getIntegerVT(std::max<unsigned>(NumElts, 8)); | |||
| 13264 | SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType); | |||
| 13265 | return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG); | |||
| 13266 | } | |||
| 13267 | default: | |||
| 13268 | llvm_unreachable("Not a supported integer vector type!"); | |||
| 13269 | } | |||
| 13270 | } | |||
| 13271 | ||||
| 13272 | /// Try to lower as a blend of elements from two inputs followed by | |||
| 13273 | /// a single-input permutation. | |||
| 13274 | /// | |||
| 13275 | /// This matches the pattern where we can blend elements from two inputs and | |||
| 13276 | /// then reduce the shuffle to a single-input permutation. | |||
| 13277 | static SDValue lowerShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT, | |||
| 13278 | SDValue V1, SDValue V2, | |||
| 13279 | ArrayRef<int> Mask, | |||
| 13280 | SelectionDAG &DAG, | |||
| 13281 | bool ImmBlends = false) { | |||
| 13282 | // We build up the blend mask while checking whether a blend is a viable way | |||
| 13283 | // to reduce the shuffle. | |||
| 13284 | SmallVector<int, 32> BlendMask(Mask.size(), -1); | |||
| 13285 | SmallVector<int, 32> PermuteMask(Mask.size(), -1); | |||
| 13286 | ||||
| 13287 | for (int i = 0, Size = Mask.size(); i < Size; ++i) { | |||
| 13288 | if (Mask[i] < 0) | |||
| 13289 | continue; | |||
| 13290 | ||||
| 13291 | assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds."); | |||
| 13292 | ||||
| 13293 | if (BlendMask[Mask[i] % Size] < 0) | |||
| 13294 | BlendMask[Mask[i] % Size] = Mask[i]; | |||
| 13295 | else if (BlendMask[Mask[i] % Size] != Mask[i]) | |||
| 13296 | return SDValue(); // Can't blend in the needed input! | |||
| 13297 | ||||
| 13298 | PermuteMask[i] = Mask[i] % Size; | |||
| 13299 | } | |||
| 13300 | ||||
| 13301 | // If only immediate blends, then bail if the blend mask can't be widened to | |||
| 13302 | // i16. | |||
| 13303 | unsigned EltSize = VT.getScalarSizeInBits(); | |||
| 13304 | if (ImmBlends && EltSize == 8 && !canWidenShuffleElements(BlendMask)) | |||
| 13305 | return SDValue(); | |||
| 13306 | ||||
| 13307 | SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask); | |||
| 13308 | return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask); | |||
| 13309 | } | |||
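| | // Worked example (illustrative): for VT = v4i32 and Mask = <6,0,5,3>, the | |||
| | // blend mask builds up as <0,5,6,3> (element i of the blend always comes | |||
| | // from position i of V1 or V2) and PermuteMask = <2,0,1,3> then moves the | |||
| | // blended elements into the requested order. | |||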
| 13310 | ||||
| 13311 | /// Try to lower as an unpack of elements from two inputs followed by | |||
| 13312 | /// a single-input permutation. | |||
| 13313 | /// | |||
| 13314 | /// This matches the pattern where we can unpack elements from two inputs and | |||
| 13315 | /// then reduce the shuffle to a single-input (wider) permutation. | |||
| 13316 | static SDValue lowerShuffleAsUNPCKAndPermute(const SDLoc &DL, MVT VT, | |||
| 13317 | SDValue V1, SDValue V2, | |||
| 13318 | ArrayRef<int> Mask, | |||
| 13319 | SelectionDAG &DAG) { | |||
| 13320 | int NumElts = Mask.size(); | |||
| 13321 | int NumLanes = VT.getSizeInBits() / 128; | |||
| 13322 | int NumLaneElts = NumElts / NumLanes; | |||
| 13323 | int NumHalfLaneElts = NumLaneElts / 2; | |||
| 13324 | ||||
| 13325 | bool MatchLo = true, MatchHi = true; | |||
| 13326 | SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)}; | |||
| 13327 | ||||
| 13328 | // Determine UNPCKL/UNPCKH type and operand order. | |||
| 13329 | for (int Elt = 0; Elt != NumElts; ++Elt) { | |||
| 13330 | int M = Mask[Elt]; | |||
| 13331 | if (M < 0) | |||
| 13332 | continue; | |||
| 13333 | ||||
| 13334 | // Normalize the mask value depending on whether it's V1 or V2. | |||
| 13335 | int NormM = M; | |||
| 13336 | SDValue &Op = Ops[Elt & 1]; | |||
| 13337 | if (M < NumElts && (Op.isUndef() || Op == V1)) | |||
| 13338 | Op = V1; | |||
| 13339 | else if (NumElts <= M && (Op.isUndef() || Op == V2)) { | |||
| 13340 | Op = V2; | |||
| 13341 | NormM -= NumElts; | |||
| 13342 | } else | |||
| 13343 | return SDValue(); | |||
| 13344 | ||||
| 13345 | bool MatchLoAnyLane = false, MatchHiAnyLane = false; | |||
| 13346 | for (int Lane = 0; Lane != NumElts; Lane += NumLaneElts) { | |||
| 13347 | int Lo = Lane, Mid = Lane + NumHalfLaneElts, Hi = Lane + NumLaneElts; | |||
| 13348 | MatchLoAnyLane |= isUndefOrInRange(NormM, Lo, Mid); | |||
| 13349 | MatchHiAnyLane |= isUndefOrInRange(NormM, Mid, Hi); | |||
| 13350 | if (MatchLoAnyLane || MatchHiAnyLane) { | |||
| 13351 | assert((MatchLoAnyLane ^ MatchHiAnyLane) && | |||
| 13352 | "Failed to match UNPCKLO/UNPCKHI"); | |||
| 13353 | break; | |||
| 13354 | } | |||
| 13355 | } | |||
| 13356 | MatchLo &= MatchLoAnyLane; | |||
| 13357 | MatchHi &= MatchHiAnyLane; | |||
| 13358 | if (!MatchLo && !MatchHi) | |||
| 13359 | return SDValue(); | |||
| 13360 | } | |||
| 13361 | assert((MatchLo ^ MatchHi) && "Failed to match UNPCKLO/UNPCKHI"); | |||
| 13362 | ||||
| 13363 | // Element indices have changed after unpacking. Calculate permute mask | |||
| 13364 | // so that they will be put back to the position as dictated by the | |||
| 13365 | // original shuffle mask indices. | |||
| 13366 | SmallVector<int, 32> PermuteMask(NumElts, -1); | |||
| 13367 | for (int Elt = 0; Elt != NumElts; ++Elt) { | |||
| 13368 | int M = Mask[Elt]; | |||
| 13369 | if (M < 0) | |||
| 13370 | continue; | |||
| 13371 | int NormM = M; | |||
| 13372 | if (NumElts <= M) | |||
| 13373 | NormM -= NumElts; | |||
| 13374 | bool IsFirstOp = M < NumElts; | |||
| 13375 | int BaseMaskElt = | |||
| 13376 | NumLaneElts * (NormM / NumLaneElts) + (2 * (NormM % NumHalfLaneElts)); | |||
| 13377 | if ((IsFirstOp && V1 == Ops[0]) || (!IsFirstOp && V2 == Ops[0])) | |||
| 13378 | PermuteMask[Elt] = BaseMaskElt; | |||
| 13379 | else if ((IsFirstOp && V1 == Ops[1]) || (!IsFirstOp && V2 == Ops[1])) | |||
| 13380 | PermuteMask[Elt] = BaseMaskElt + 1; | |||
| 13381 | assert(PermuteMask[Elt] != -1 && | |||
| 13382 | "Input mask element is defined but failed to assign permute mask"); | |||
| 13383 | } | |||
| 13384 | ||||
| 13385 | unsigned UnpckOp = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH; | |||
| 13386 | SDValue Unpck = DAG.getNode(UnpckOp, DL, VT, Ops); | |||
| 13387 | return DAG.getVectorShuffle(VT, DL, Unpck, DAG.getUNDEF(VT), PermuteMask); | |||
| 13388 | } | |||
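| | // Worked example (assumed operands): for VT = v4i32 and Mask = <1,5,0,4>, | |||
| | // even elements come from V1 and odd elements from V2, all from the low | |||
| | // halves, so UNPCKL(V1, V2) = <0,4,1,5> is formed first and | |||
| | // PermuteMask = <2,3,0,1> restores the requested order. | |||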
| 13389 | ||||
| 13390 | /// Try to lower a shuffle as a permute of the inputs followed by an | |||
| 13391 | /// UNPCK instruction. | |||
| 13392 | /// | |||
| 13393 | /// This specifically targets cases where we end up with alternating between | |||
| 13394 | /// the two inputs, and so can permute them into something that feeds a single | |||
| 13395 | /// UNPCK instruction. Note that this routine only targets integer vectors | |||
| 13396 | /// because for floating point vectors we have a generalized SHUFPS lowering | |||
| 13397 | /// strategy that handles everything that doesn't *exactly* match an unpack, | |||
| 13398 | /// making this clever lowering unnecessary. | |||
| 13399 | static SDValue lowerShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT, | |||
| 13400 | SDValue V1, SDValue V2, | |||
| 13401 | ArrayRef<int> Mask, | |||
| 13402 | const X86Subtarget &Subtarget, | |||
| 13403 | SelectionDAG &DAG) { | |||
| 13404 | int Size = Mask.size(); | |||
| 13405 | assert(Mask.size() >= 2 && "Single element masks are invalid."); | |||
| 13406 | ||||
| 13407 | // This routine only supports 128-bit integer dual input vectors. | |||
| 13408 | if (VT.isFloatingPoint() || !VT.is128BitVector() || V2.isUndef()) | |||
| 13409 | return SDValue(); | |||
| 13410 | ||||
| 13411 | int NumLoInputs = | |||
| 13412 | count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; }); | |||
| 13413 | int NumHiInputs = | |||
| 13414 | count_if(Mask, [Size](int M) { return M % Size >= Size / 2; }); | |||
| 13415 | ||||
| 13416 | bool UnpackLo = NumLoInputs >= NumHiInputs; | |||
| 13417 | ||||
| 13418 | auto TryUnpack = [&](int ScalarSize, int Scale) { | |||
| 13419 | SmallVector<int, 16> V1Mask((unsigned)Size, -1); | |||
| 13420 | SmallVector<int, 16> V2Mask((unsigned)Size, -1); | |||
| 13421 | ||||
| 13422 | for (int i = 0; i < Size; ++i) { | |||
| 13423 | if (Mask[i] < 0) | |||
| 13424 | continue; | |||
| 13425 | ||||
| 13426 | // Each element of the unpack contains Scale elements from this mask. | |||
| 13427 | int UnpackIdx = i / Scale; | |||
| 13428 | ||||
| 13429 | // We only handle the case where V1 feeds the first slots of the unpack. | |||
| 13430 | // We rely on canonicalization to ensure this is the case. | |||
| 13431 | if ((UnpackIdx % 2 == 0) != (Mask[i] < Size)) | |||
| 13432 | return SDValue(); | |||
| 13433 | ||||
| 13434 | // Setup the mask for this input. The indexing is tricky as we have to | |||
| 13435 | // handle the unpack stride. | |||
| 13436 | SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask; | |||
| 13437 | VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] = | |||
| 13438 | Mask[i] % Size; | |||
| 13439 | } | |||
| 13440 | ||||
| 13441 | // If we will have to shuffle both inputs to use the unpack, check whether | |||
| 13442 | // we can just unpack first and shuffle the result. If so, skip this unpack. | |||
| 13443 | if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) && | |||
| 13444 | !isNoopShuffleMask(V2Mask)) | |||
| 13445 | return SDValue(); | |||
| 13446 | ||||
| 13447 | // Shuffle the inputs into place. | |||
| 13448 | V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); | |||
| 13449 | V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); | |||
| 13450 | ||||
| 13451 | // Cast the inputs to the type we will use to unpack them. | |||
| 13452 | MVT UnpackVT = | |||
| 13453 | MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale); | |||
| 13454 | V1 = DAG.getBitcast(UnpackVT, V1); | |||
| 13455 | V2 = DAG.getBitcast(UnpackVT, V2); | |||
| 13456 | ||||
| 13457 | // Unpack the inputs and cast the result back to the desired type. | |||
| 13458 | return DAG.getBitcast( | |||
| 13459 | VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, | |||
| 13460 | UnpackVT, V1, V2)); | |||
| 13461 | }; | |||
| 13462 | ||||
| 13463 | // We try each unpack from the largest to the smallest to try and find one | |||
| 13464 | // that fits this mask. | |||
| 13465 | int OrigScalarSize = VT.getScalarSizeInBits(); | |||
| 13466 | for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2) | |||
| 13467 | if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize)) | |||
| 13468 | return Unpack; | |||
| 13469 | ||||
| 13470 | // If we're shuffling with a zero vector then we're better off not doing | |||
| 13471 | // VECTOR_SHUFFLE(UNPCK()) as we lose track of those zero elements. | |||
| 13472 | if (ISD::isBuildVectorAllZeros(V1.getNode()) || | |||
| 13473 | ISD::isBuildVectorAllZeros(V2.getNode())) | |||
| 13474 | return SDValue(); | |||
| 13475 | ||||
| 13476 | // If none of the unpack-rooted lowerings worked (or were profitable) try an | |||
| 13477 | // initial unpack. | |||
| 13478 | if (NumLoInputs == 0 || NumHiInputs == 0) { | |||
| 13479 | assert((NumLoInputs > 0 || NumHiInputs > 0) && | |||
| 13480 | "We have to have *some* inputs!"); | |||
| 13481 | int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0; | |||
| 13482 | ||||
| 13483 | // FIXME: We could consider the total complexity of the permute of each | |||
| 13484 | // possible unpacking. Or at the least we should consider how many | |||
| 13485 | // half-crossings are created. | |||
| 13486 | // FIXME: We could consider commuting the unpacks. | |||
| 13487 | ||||
| 13488 | SmallVector<int, 32> PermMask((unsigned)Size, -1); | |||
| 13489 | for (int i = 0; i < Size; ++i) { | |||
| 13490 | if (Mask[i] < 0) | |||
| 13491 | continue; | |||
| 13492 | ||||
| 13493 | assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!"); | |||
| 13494 | ||||
| 13495 | PermMask[i] = | |||
| 13496 | 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1); | |||
| 13497 | } | |||
| 13498 | return DAG.getVectorShuffle( | |||
| 13499 | VT, DL, | |||
| 13500 | DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL, DL, VT, | |||
| 13501 | V1, V2), | |||
| 13502 | DAG.getUNDEF(VT), PermMask); | |||
| 13503 | } | |||
| 13504 | ||||
| 13505 | return SDValue(); | |||
| 13506 | } | |||
| 13507 | ||||
| 13508 | /// Helper to form a PALIGNR-based rotate+permute, merging 2 inputs and then | |||
| 13509 | /// permuting the elements of the result in place. | |||
| 13510 | static SDValue lowerShuffleAsByteRotateAndPermute( | |||
| 13511 | const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, | |||
| 13512 | const X86Subtarget &Subtarget, SelectionDAG &DAG) { | |||
| 13513 | if ((VT.is128BitVector() && !Subtarget.hasSSSE3()) || | |||
| 13514 | (VT.is256BitVector() && !Subtarget.hasAVX2()) || | |||
| 13515 | (VT.is512BitVector() && !Subtarget.hasBWI())) | |||
| 13516 | return SDValue(); | |||
| 13517 | ||||
| 13518 | // We don't currently support lane crossing permutes. | |||
| 13519 | if (is128BitLaneCrossingShuffleMask(VT, Mask)) | |||
| 13520 | return SDValue(); | |||
| 13521 | ||||
| 13522 | int Scale = VT.getScalarSizeInBits() / 8; | |||
| 13523 | int NumLanes = VT.getSizeInBits() / 128; | |||
| 13524 | int NumElts = VT.getVectorNumElements(); | |||
| 13525 | int NumEltsPerLane = NumElts / NumLanes; | |||
| 13526 | ||||
| 13527 | // Determine range of mask elts. | |||
| 13528 | bool Blend1 = true; | |||
| 13529 | bool Blend2 = true; | |||
| 13530 | std::pair<int, int> Range1 = std::make_pair(INT_MAX, INT_MIN); | |||
| 13531 | std::pair<int, int> Range2 = std::make_pair(INT_MAX, INT_MIN); | |||
| 13532 | for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { | |||
| 13533 | for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { | |||
| 13534 | int M = Mask[Lane + Elt]; | |||
| 13535 | if (M < 0) | |||
| 13536 | continue; | |||
| 13537 | if (M < NumElts) { | |||
| 13538 | Blend1 &= (M == (Lane + Elt)); | |||
| 13539 | assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask"); | |||
| 13540 | M = M % NumEltsPerLane; | |||
| 13541 | Range1.first = std::min(Range1.first, M); | |||
| 13542 | Range1.second = std::max(Range1.second, M); | |||
| 13543 | } else { | |||
| 13544 | M -= NumElts; | |||
| 13545 | Blend2 &= (M == (Lane + Elt)); | |||
| 13546 | assert(Lane <= M && M < (Lane + NumEltsPerLane) && "Out of range mask"); | |||
| 13547 | M = M % NumEltsPerLane; | |||
| 13548 | Range2.first = std::min(Range2.first, M); | |||
| 13549 | Range2.second = std::max(Range2.second, M); | |||
| 13550 | } | |||
| 13551 | } | |||
| 13552 | } | |||
| 13553 | ||||
| 13554 | // Bail if we don't need both elements. | |||
| 13555 | // TODO - it might be worth doing this for unary shuffles if the permute | |||
| 13556 | // can be widened. | |||
| 13557 | if (!(0 <= Range1.first && Range1.second < NumEltsPerLane) || | |||
| 13558 | !(0 <= Range2.first && Range2.second < NumEltsPerLane)) | |||
| 13559 | return SDValue(); | |||
| 13560 | ||||
| 13561 | if (VT.getSizeInBits() > 128 && (Blend1 || Blend2)) | |||
| 13562 | return SDValue(); | |||
| 13563 | ||||
| 13564 | // Rotate the 2 ops so we can access both ranges, then permute the result. | |||
| 13565 | auto RotateAndPermute = [&](SDValue Lo, SDValue Hi, int RotAmt, int Ofs) { | |||
| 13566 | MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); | |||
| 13567 | SDValue Rotate = DAG.getBitcast( | |||
| 13568 | VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, DAG.getBitcast(ByteVT, Hi), | |||
| 13569 | DAG.getBitcast(ByteVT, Lo), | |||
| 13570 | DAG.getTargetConstant(Scale * RotAmt, DL, MVT::i8))); | |||
| 13571 | SmallVector<int, 64> PermMask(NumElts, SM_SentinelUndef); | |||
| 13572 | for (int Lane = 0; Lane != NumElts; Lane += NumEltsPerLane) { | |||
| 13573 | for (int Elt = 0; Elt != NumEltsPerLane; ++Elt) { | |||
| 13574 | int M = Mask[Lane + Elt]; | |||
| 13575 | if (M < 0) | |||
| 13576 | continue; | |||
| 13577 | if (M < NumElts) | |||
| 13578 | PermMask[Lane + Elt] = Lane + ((M + Ofs - RotAmt) % NumEltsPerLane); | |||
| 13579 | else | |||
| 13580 | PermMask[Lane + Elt] = Lane + ((M - Ofs - RotAmt) % NumEltsPerLane); | |||
| 13581 | } | |||
| 13582 | } | |||
| 13583 | return DAG.getVectorShuffle(VT, DL, Rotate, DAG.getUNDEF(VT), PermMask); | |||
| 13584 | }; | |||
| 13585 | ||||
| 13586 | // Check if the ranges are small enough to rotate from either direction. | |||
| 13587 | if (Range2.second < Range1.first) | |||
| 13588 | return RotateAndPermute(V1, V2, Range1.first, 0); | |||
| 13589 | if (Range1.second < Range2.first) | |||
| 13590 | return RotateAndPermute(V2, V1, Range2.first, NumElts); | |||
| 13591 | return SDValue(); | |||
| 13592 | } | |||
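// Illustrative sketch (standalone, not LLVM code): emulate 128-bit PALIGNR
// to see why one rotation exposes both in-lane source ranges. Following the
// intrinsic, palignr(Op0, Op1, n) concatenates Op0:Op1 (Op1 in the low 16
// bytes) and keeps bytes [n, n+16). All names here are hypothetical.
#include <array>
#include <cstdio>
using V16 = std::array<unsigned char, 16>;
static V16 palignr(const V16 &Op0, const V16 &Op1, int N) {
  V16 R{};
  for (int I = 0; I < 16; ++I) {
    int Src = I + N; // byte index into the 32-byte value Op0:Op1
    R[I] = Src < 16 ? Op1[Src] : Op0[Src - 16];
  }
  return R;
}
int main() {
  V16 Lo, Hi;
  for (int I = 0; I < 16; ++I) { Lo[I] = I; Hi[I] = 100 + I; }
  // Rotating by 4 bytes makes Lo[4..15] and Hi[0..3] contiguous in one
  // register, so a single in-place permute can then finish the shuffle.
  V16 R = palignr(Hi, Lo, 4);
  for (unsigned char B : R)
    printf("%d ", B); // 4 5 ... 15 100 101 102 103
  printf("\n");
}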
| 13593 | ||||
| 13594 | static bool isBroadcastShuffleMask(ArrayRef<int> Mask) { | |||
| 13595 | return isUndefOrEqual(Mask, 0); | |||
| 13596 | } | |||
| 13597 | ||||
| 13598 | static bool isNoopOrBroadcastShuffleMask(ArrayRef<int> Mask) { | |||
| 13599 | return isNoopShuffleMask(Mask) || isBroadcastShuffleMask(Mask); | |||
| 13600 | } | |||
| 13601 | ||||
| 13602 | /// Check if the Mask consists of the same element repeated multiple times. | |||
| 13603 | static bool isSingleElementRepeatedMask(ArrayRef<int> Mask) { | |||
| 13604 | size_t NumUndefs = 0; | |||
| 13605 | std::optional<int> UniqueElt; | |||
| 13606 | for (int Elt : Mask) { | |||
| 13607 | if (Elt == SM_SentinelUndef) { | |||
| 13608 | NumUndefs++; | |||
| 13609 | continue; | |||
| 13610 | } | |||
| 13611 | if (UniqueElt.has_value() && UniqueElt.value() != Elt) | |||
| 13612 | return false; | |||
| 13613 | UniqueElt = Elt; | |||
| 13614 | } | |||
| 13615 | // Make sure the element is repeated enough times by checking that the | |||
| 13616 | // number of undefs is small. | |||
| 13617 | return NumUndefs <= Mask.size() / 2 && UniqueElt.has_value(); | |||
| 13618 | } | |||
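// A minimal sketch (not LLVM code) of what the predicate above accepts and
// rejects, restated over std::vector so it runs standalone:
#include <cassert>
#include <cstddef>
#include <optional>
#include <vector>
static bool singleEltRepeated(const std::vector<int> &Mask) {
  size_t Undefs = 0;
  std::optional<int> Uniq;
  for (int E : Mask) {
    if (E < 0) { ++Undefs; continue; } // treat negatives as undef sentinels
    if (Uniq && *Uniq != E) return false;
    Uniq = E;
  }
  return Undefs <= Mask.size() / 2 && Uniq.has_value();
}
int main() {
  assert(singleEltRepeated({3, 3, -1, 3}));    // one element, one undef
  assert(!singleEltRepeated({3, -1, -1, -1})); // too many undefs
  assert(!singleEltRepeated({3, 4, 3, 3}));    // two distinct elements
}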
| 13619 | ||||
| 13620 | /// Generic routine to decompose a shuffle and blend into independent | |||
| 13621 | /// blends and permutes. | |||
| 13622 | /// | |||
| 13623 | /// This matches the extremely common pattern for handling combined | |||
| 13624 | /// shuffle+blend operations on newer X86 ISAs where we have very fast blend | |||
| 13625 | /// operations. It will try to pick the best arrangement of shuffles and | |||
| 13626 | /// blends. For vXi8/vXi16 shuffles we may use unpack instead of blend. | |||
| 13627 | static SDValue lowerShuffleAsDecomposedShuffleMerge( | |||
| 13628 | const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, | |||
| 13629 | const X86Subtarget &Subtarget, SelectionDAG &DAG) { | |||
| 13630 | int NumElts = Mask.size(); | |||
| 13631 | int NumLanes = VT.getSizeInBits() / 128; | |||
| 13632 | int NumEltsPerLane = NumElts / NumLanes; | |||
| 13633 | ||||
| 13634 | // Shuffle the input elements into the desired positions in V1 and V2 and | |||
| 13635 | // unpack/blend them together. | |||
| 13636 | bool IsAlternating = true; | |||
| 13637 | SmallVector<int, 32> V1Mask(NumElts, -1); | |||
| 13638 | SmallVector<int, 32> V2Mask(NumElts, -1); | |||
| 13639 | SmallVector<int, 32> FinalMask(NumElts, -1); | |||
| 13640 | for (int i = 0; i < NumElts; ++i) { | |||
| 13641 | int M = Mask[i]; | |||
| 13642 | if (M >= 0 && M < NumElts) { | |||
| 13643 | V1Mask[i] = M; | |||
| 13644 | FinalMask[i] = i; | |||
| 13645 | IsAlternating &= (i & 1) == 0; | |||
| 13646 | } else if (M >= NumElts) { | |||
| 13647 | V2Mask[i] = M - NumElts; | |||
| 13648 | FinalMask[i] = i + NumElts; | |||
| 13649 | IsAlternating &= (i & 1) == 1; | |||
| 13650 | } | |||
| 13651 | } | |||
| 13652 | ||||
| 13653 | // If we effectively only demand the 0'th element of \p Input, though not | |||
| 13654 | // necessarily only in the 0'th position, then broadcast said input, | |||
| 13655 | // and change \p InputMask to be a no-op (identity) mask. | |||
| 13656 | auto canonicalizeBroadcastableInput = [DL, VT, &Subtarget, | |||
| 13657 | &DAG](SDValue &Input, | |||
| 13658 | MutableArrayRef<int> InputMask) { | |||
| 13659 | unsigned EltSizeInBits = Input.getScalarValueSizeInBits(); | |||
| 13660 | if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 || | |||
| 13661 | !X86::mayFoldLoad(Input, Subtarget))) | |||
| 13662 | return; | |||
| 13663 | if (isNoopShuffleMask(InputMask)) | |||
| 13664 | return; | |||
| 13665 | assert(isBroadcastShuffleMask(InputMask) && | |||
| 13666 | "Expected to demand only the 0'th element."); | |||
| 13667 | Input = DAG.getNode(X86ISD::VBROADCAST, DL, VT, Input); | |||
| 13668 | for (auto I : enumerate(InputMask)) { | |||
| 13669 | int &InputMaskElt = I.value(); | |||
| 13670 | if (InputMaskElt >= 0) | |||
| 13671 | InputMaskElt = I.index(); | |||
| 13672 | } | |||
| 13673 | }; | |||
| 13674 | ||||
| 13675 | // Currently, we may need to produce one shuffle per input, and blend results. | |||
| 13676 | // It is possible that the shuffle for one of the inputs is already a no-op. | |||
| 13677 | // See if we can simplify non-no-op shuffles into broadcasts, | |||
| 13678 | // which we consider to be strictly better than an arbitrary shuffle. | |||
| 13679 | if (isNoopOrBroadcastShuffleMask(V1Mask) && | |||
| 13680 | isNoopOrBroadcastShuffleMask(V2Mask)) { | |||
| 13681 | canonicalizeBroadcastableInput(V1, V1Mask); | |||
| 13682 | canonicalizeBroadcastableInput(V2, V2Mask); | |||
| 13683 | } | |||
| 13684 | ||||
| 13685 | // Try to lower with the simpler initial blend/unpack/rotate strategies unless | |||
| 13686 | // one of the input shuffles would be a no-op. We prefer to shuffle inputs as | |||
| 13687 | // the shuffle may be able to fold with a load or other benefit. However, when | |||
| 13688 | // we'll have to do 2x as many shuffles in order to achieve this, a 2-input | |||
| 13689 | // pre-shuffle first is a better strategy. | |||
| 13690 | if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask)) { | |||
| 13691 | // Only prefer immediate blends to unpack/rotate. | |||
| 13692 | if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, | |||
| 13693 | DAG, true)) | |||
| 13694 | return BlendPerm; | |||
| 13695 | // If either input vector provides only a single element which is repeated | |||
| 13696 | // multiple times, unpacking from both input vectors would generate worse | |||
| 13697 | // code. e.g. for | |||
| 13698 | // t5: v16i8 = vector_shuffle<16,0,16,1,16,2,16,3,16,4,16,5,16,6,16,7> t2, t4 | |||
| 13699 | // it is better to process t4 first to create a vector of t4[0], then unpack | |||
| 13700 | // that vector with t2. | |||
| 13701 | if (!isSingleElementRepeatedMask(V1Mask) && | |||
| 13702 | !isSingleElementRepeatedMask(V2Mask)) | |||
| 13703 | if (SDValue UnpackPerm = | |||
| 13704 | lowerShuffleAsUNPCKAndPermute(DL, VT, V1, V2, Mask, DAG)) | |||
| 13705 | return UnpackPerm; | |||
| 13706 | if (SDValue RotatePerm = lowerShuffleAsByteRotateAndPermute( | |||
| 13707 | DL, VT, V1, V2, Mask, Subtarget, DAG)) | |||
| 13708 | return RotatePerm; | |||
| 13709 | // Unpack/rotate failed - try again with variable blends. | |||
| 13710 | if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, | |||
| 13711 | DAG)) | |||
| 13712 | return BlendPerm; | |||
| 13713 | if (VT.getScalarSizeInBits() >= 32) | |||
| 13714 | if (SDValue PermUnpack = lowerShuffleAsPermuteAndUnpack( | |||
| 13715 | DL, VT, V1, V2, Mask, Subtarget, DAG)) | |||
| 13716 | return PermUnpack; | |||
| 13717 | } | |||
| 13718 | ||||
| 13719 | // If the final mask is an alternating blend of vXi8/vXi16, convert to an | |||
| 13720 | // UNPCKL(SHUFFLE, SHUFFLE) pattern. | |||
| 13721 | // TODO: It doesn't have to be alternating - but each lane mustn't have more | |||
| 13722 | // than half the elements coming from each source. | |||
| 13723 | if (IsAlternating && VT.getScalarSizeInBits() < 32) { | |||
| 13724 | V1Mask.assign(NumElts, -1); | |||
| 13725 | V2Mask.assign(NumElts, -1); | |||
| 13726 | FinalMask.assign(NumElts, -1); | |||
| 13727 | for (int i = 0; i != NumElts; i += NumEltsPerLane) | |||
| 13728 | for (int j = 0; j != NumEltsPerLane; ++j) { | |||
| 13729 | int M = Mask[i + j]; | |||
| 13730 | if (M >= 0 && M < NumElts) { | |||
| 13731 | V1Mask[i + (j / 2)] = M; | |||
| 13732 | FinalMask[i + j] = i + (j / 2); | |||
| 13733 | } else if (M >= NumElts) { | |||
| 13734 | V2Mask[i + (j / 2)] = M - NumElts; | |||
| 13735 | FinalMask[i + j] = i + (j / 2) + NumElts; | |||
| 13736 | } | |||
| 13737 | } | |||
| 13738 | } | |||
| 13739 | ||||
| 13740 | V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask); | |||
| 13741 | V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask); | |||
| 13742 | return DAG.getVectorShuffle(VT, DL, V1, V2, FinalMask); | |||
| 13743 | } | |||
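// Sketch of the decomposition the loop above performs (standalone, names
// hypothetical): for the v4 mask {0, 9, 2, 11}, elements >= 4 come from V2,
// so the shuffle splits into two unary shuffles plus an element-wise blend.
#include <cstdio>
int main() {
  const int NumElts = 4;
  int Mask[NumElts] = {0, 9, 2, 11};
  int V1Mask[NumElts], V2Mask[NumElts], FinalMask[NumElts];
  for (int i = 0; i < NumElts; ++i) {
    V1Mask[i] = V2Mask[i] = FinalMask[i] = -1;
    int M = Mask[i];
    if (M >= 0 && M < NumElts) { // sourced from V1
      V1Mask[i] = M;
      FinalMask[i] = i;
    } else if (M >= NumElts) {   // sourced from V2
      V2Mask[i] = M - NumElts;
      FinalMask[i] = i + NumElts;
    }
  }
  // Prints V1Mask {0,-1,2,-1}, V2Mask {-1,1,-1,3}, FinalMask {0,5,2,7};
  // FinalMask alternates sources, so IsAlternating would stay true here.
  for (int i = 0; i < NumElts; ++i)
    printf("%d %d %d\n", V1Mask[i], V2Mask[i], FinalMask[i]);
}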
| 13744 | ||||
| 13745 | /// Try to lower a vector shuffle as a bit rotation. | |||
| 13746 | /// | |||
| 13747 | /// Look for a repeated rotation pattern in each sub group. | |||
| 13748 | /// Returns a ISD::ROTL element rotation amount or -1 if failed. | |||
| 13749 | static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) { | |||
| 13750 | int NumElts = Mask.size(); | |||
| 13751 | assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask"); | |||
| 13752 | ||||
| 13753 | int RotateAmt = -1; | |||
| 13754 | for (int i = 0; i != NumElts; i += NumSubElts) { | |||
| 13755 | for (int j = 0; j != NumSubElts; ++j) { | |||
| 13756 | int M = Mask[i + j]; | |||
| 13757 | if (M < 0) | |||
| 13758 | continue; | |||
| 13759 | if (!isInRange(M, i, i + NumSubElts)) | |||
| 13760 | return -1; | |||
| 13761 | int Offset = (NumSubElts - (M - (i + j))) % NumSubElts; | |||
| 13762 | if (0 <= RotateAmt && Offset != RotateAmt) | |||
| 13763 | return -1; | |||
| 13764 | RotateAmt = Offset; | |||
| 13765 | } | |||
| 13766 | } | |||
| 13767 | return RotateAmt; | |||
| 13768 | } | |||
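// Sketch (standalone): the matcher above derives one rotate amount per
// sub-group. A v16i8 shuffle whose i32-sized groups all look like {1,2,3,0}
// yields RotateAmt = 3 elements, i.e. a 24-bit ISD::ROTL on v4i32 -- on a
// little-endian element, result byte j is source byte (j+1)&3.
#include <cstdint>
#include <cstdio>
int main() {
  const int NumSubElts = 4;
  int Group[NumSubElts] = {1, 2, 3, 0};
  int RotateAmt = -1;
  for (int j = 0; j != NumSubElts; ++j) {
    int Offset = (NumSubElts - (Group[j] - j)) % NumSubElts;
    if (RotateAmt >= 0 && Offset != RotateAmt) { RotateAmt = -1; break; }
    RotateAmt = Offset;
  }
  printf("elt rotate = %d (%d bits)\n", RotateAmt, RotateAmt * 8);
  // Cross-check with a scalar 32-bit rotate-left by 24:
  uint32_t X = 0x04030201u;          // LE bytes 01 02 03 04
  uint32_t R = (X << 24) | (X >> 8); // LE bytes 02 03 04 01
  printf("0x%08x\n", R);             // 0x01040302
}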
| 13769 | ||||
| 13770 | static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits, | |||
| 13771 | const X86Subtarget &Subtarget, | |||
| 13772 | ArrayRef<int> Mask) { | |||
| 13773 | assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); | |||
| 13774 | assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers"); | |||
| 13775 | ||||
| 13776 | // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size. | |||
| 13777 | int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2; | |||
| 13778 | int MaxSubElts = 64 / EltSizeInBits; | |||
| 13779 | for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) { | |||
| 13780 | int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts); | |||
| 13781 | if (RotateAmt < 0) | |||
| 13782 | continue; | |||
| 13783 | ||||
| 13784 | int NumElts = Mask.size(); | |||
| 13785 | MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts); | |||
| 13786 | RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts); | |||
| 13787 | return RotateAmt * EltSizeInBits; | |||
| 13788 | } | |||
| 13789 | ||||
| 13790 | return -1; | |||
| 13791 | } | |||
| 13792 | ||||
| 13793 | /// Lower shuffle using X86ISD::VROTLI rotations. | |||
| 13794 | static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1, | |||
| 13795 | ArrayRef<int> Mask, | |||
| 13796 | const X86Subtarget &Subtarget, | |||
| 13797 | SelectionDAG &DAG) { | |||
| 13798 | // Only XOP + AVX512 targets have bit rotation instructions. | |||
| 13799 | // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this. | |||
| 13800 | bool IsLegal = | |||
| 13801 | (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512(); | |||
| 13802 | if (!IsLegal && Subtarget.hasSSE3()) | |||
| 13803 | return SDValue(); | |||
| 13804 | ||||
| 13805 | MVT RotateVT; | |||
| 13806 | int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(), | |||
| 13807 | Subtarget, Mask); | |||
| 13808 | if (RotateAmt < 0) | |||
| 13809 | return SDValue(); | |||
| 13810 | ||||
| 13811 | // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL, | |||
| 13812 | // expanded to OR(SRL,SHL), will be more efficient, but if they can | |||
| 13813 | // widen to vXi16 or more then the existing lowering will be better. | |||
| 13814 | if (!IsLegal) { | |||
| 13815 | if ((RotateAmt % 16) == 0) | |||
| 13816 | return SDValue(); | |||
| 13817 | // TODO: Use getTargetVShiftByConstNode. | |||
| 13818 | unsigned ShlAmt = RotateAmt; | |||
| 13819 | unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt; | |||
| 13820 | V1 = DAG.getBitcast(RotateVT, V1); | |||
| 13821 | SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1, | |||
| 13822 | DAG.getTargetConstant(ShlAmt, DL, MVT::i8)); | |||
| 13823 | SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1, | |||
| 13824 | DAG.getTargetConstant(SrlAmt, DL, MVT::i8)); | |||
| 13825 | SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL); | |||
| 13826 | return DAG.getBitcast(VT, Rot); | |||
| 13827 | } | |||
| 13828 | ||||
| 13829 | SDValue Rot = | |||
| 13830 | DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1), | |||
| 13831 | DAG.getTargetConstant(RotateAmt, DL, MVT::i8)); | |||
| 13832 | return DAG.getBitcast(VT, Rot); | |||
| 13833 | } | |||
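// Sketch (standalone): the !IsLegal path above expands the rotate into SHL,
// SRL and OR with constant amounts. Scalar equivalent for one 16-bit
// element; note both shift amounts stay strictly below the element width,
// which is what keeps such expansions free of undefined shifts.
#include <cstdint>
#include <cstdio>
static uint16_t rotl16(uint16_t V, unsigned R) {
  // This sketch assumes 0 < R < 16, mirroring the (RotateAmt % 16) == 0
  // bail-out above.
  return (uint16_t)((V << R) | (V >> (16 - R)));
}
int main() { printf("0x%04x\n", rotl16(0xABCD, 8)); } // prints 0xcdab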
| 13834 | ||||
| 13835 | /// Try to match a vector shuffle as an element rotation. | |||
| 13836 | /// | |||
| 13837 | /// This is used to support PALIGNR for SSSE3 and VALIGND/Q for AVX512. | |||
| 13838 | static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2, | |||
| 13839 | ArrayRef<int> Mask) { | |||
| 13840 | int NumElts = Mask.size(); | |||
| 13841 | ||||
| 13842 | // We need to detect various ways of spelling a rotation: | |||
| 13843 | // [11, 12, 13, 14, 15, 0, 1, 2] | |||
| 13844 | // [-1, 12, 13, 14, -1, -1, 1, -1] | |||
| 13845 | // [-1, -1, -1, -1, -1, -1, 1, 2] | |||
| 13846 | // [ 3, 4, 5, 6, 7, 8, 9, 10] | |||
| 13847 | // [-1, 4, 5, 6, -1, -1, 9, -1] | |||
| 13848 | // [-1, 4, 5, 6, -1, -1, -1, -1] | |||
| 13849 | int Rotation = 0; | |||
| 13850 | SDValue Lo, Hi; | |||
| 13851 | for (int i = 0; i < NumElts; ++i) { | |||
| 13852 | int M = Mask[i]; | |||
| 13853 | assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) && | |||
| 13854 | "Unexpected mask index."); | |||
| 13855 | if (M < 0) | |||
| 13856 | continue; | |||
| 13857 | ||||
| 13858 | // Determine where a rotated vector would have started. | |||
| 13859 | int StartIdx = i - (M % NumElts); | |||
| 13860 | if (StartIdx == 0) | |||
| 13861 | // The identity rotation isn't interesting, stop. | |||
| 13862 | return -1; | |||
| 13863 | ||||
| 13864 | // If we found the tail of a vector the rotation must be the missing | |||
| 13865 | // front. If we found the head of a vector, it must be how much of the | |||
| 13866 | // head. | |||
| 13867 | int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx; | |||
| 13868 | ||||
| 13869 | if (Rotation == 0) | |||
| 13870 | Rotation = CandidateRotation; | |||
| 13871 | else if (Rotation != CandidateRotation) | |||
| 13872 | // The rotations don't match, so we can't match this mask. | |||
| 13873 | return -1; | |||
| 13874 | ||||
| 13875 | // Compute which value this mask is pointing at. | |||
| 13876 | SDValue MaskV = M < NumElts ? V1 : V2; | |||
| 13877 | ||||
| 13878 | // Compute which of the two target values this index should be assigned | |||
| 13879 | // to. This reflects whether the high elements are remaining or the low | |||
| 13880 | // elements are remaining. | |||
| 13881 | SDValue &TargetV = StartIdx < 0 ? Hi : Lo; | |||
| 13882 | ||||
| 13883 | // Either set up this value if we've not encountered it before, or check | |||
| 13884 | // that it remains consistent. | |||
| 13885 | if (!TargetV) | |||
| 13886 | TargetV = MaskV; | |||
| 13887 | else if (TargetV != MaskV) | |||
| 13888 | // This may be a rotation, but it pulls from the inputs in some | |||
| 13889 | // unsupported interleaving. | |||
| 13890 | return -1; | |||
| 13891 | } | |||
| 13892 | ||||
| 13893 | // Check that we successfully analyzed the mask, and normalize the results. | |||
| 13894 | assert(Rotation != 0 && "Failed to locate a viable rotation!"); | |||
| 13895 | assert((Lo || Hi) && "Failed to find a rotated input vector!"); | |||
| 13896 | if (!Lo) | |||
| 13897 | Lo = Hi; | |||
| 13898 | else if (!Hi) | |||
| 13899 | Hi = Lo; | |||
| 13900 | ||||
| 13901 | V1 = Lo; | |||
| 13902 | V2 = Hi; | |||
| 13903 | ||||
| 13904 | return Rotation; | |||
| 13905 | } | |||
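// Sketch walking the loop above on the v8i16 mask {11,12,13,14,15,0,1,2}:
// element 0 reads V2[3] (StartIdx = -3, candidate rotation 3, V2 becomes
// Hi), while element 5 reads V1[0] (StartIdx = 5, same rotation 3, V1
// becomes Lo). A mask-only standalone checker (hypothetical; it drops the
// Lo/Hi source-consistency bookkeeping the real code performs on SDValues):
#include <cstdio>
int main() {
  const int NumElts = 8;
  int Mask[NumElts] = {11, 12, 13, 14, 15, 0, 1, 2};
  int Rotation = 0;
  for (int i = 0; i < NumElts; ++i) {
    if (Mask[i] < 0)
      continue;
    int StartIdx = i - (Mask[i] % NumElts);
    int Cand = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (StartIdx == 0 || (Rotation != 0 && Cand != Rotation)) {
      Rotation = -1; // identity or inconsistent rotation: no match
      break;
    }
    Rotation = Cand;
  }
  printf("rotation = %d\n", Rotation); // 3
}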
| 13906 | ||||
| 13907 | /// Try to lower a vector shuffle as a byte rotation. | |||
| 13908 | /// | |||
| 13909 | /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary | |||
| 13910 | /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use | |||
| 13911 | /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will | |||
| 13912 | /// try to generically lower a vector shuffle through such a pattern. It | |||
| 13913 | /// does not check for the profitability of lowering either as PALIGNR or | |||
| 13914 | /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form. | |||
| 13915 | /// This matches shuffle vectors that look like: | |||
| 13916 | /// | |||
| 13917 | /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2] | |||
| 13918 | /// | |||
| 13919 | /// Essentially it concatenates V1 and V2, shifts right by some number of | |||
| 13920 | /// elements, and takes the low elements as the result. Note that while this is | |||
| 13921 | /// specified as a *right shift* because x86 is little-endian, it is a *left | |||
| 13922 | /// rotate* of the vector lanes. | |||
| 13923 | static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2, | |||
| 13924 | ArrayRef<int> Mask) { | |||
| 13925 | // Don't accept any shuffles with zero elements. | |||
| 13926 | if (isAnyZero(Mask)) | |||
| 13927 | return -1; | |||
| 13928 | ||||
| 13929 | // PALIGNR works on 128-bit lanes. | |||
| 13930 | SmallVector<int, 16> RepeatedMask; | |||
| 13931 | if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask)) | |||
| 13932 | return -1; | |||
| 13933 | ||||
| 13934 | int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask); | |||
| 13935 | if (Rotation <= 0) | |||
| 13936 | return -1; | |||
| 13937 | ||||
| 13938 | // PALIGNR rotates bytes, so we need to scale the | |||
| 13939 | // rotation based on how many bytes are in the vector lane. | |||
| 13940 | int NumElts = RepeatedMask.size(); | |||
| 13941 | int Scale = 16 / NumElts; | |||
| 13942 | return Rotation * Scale; | |||
| 13943 | } | |||
| 13944 | ||||
| 13945 | static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1, | |||
| 13946 | SDValue V2, ArrayRef<int> Mask, | |||
| 13947 | const X86Subtarget &Subtarget, | |||
| 13948 | SelectionDAG &DAG) { | |||
| 13949 | assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); | |||
| 13950 | ||||
| 13951 | SDValue Lo = V1, Hi = V2; | |||
| 13952 | int ByteRotation = matchShuffleAsByteRotate(VT, Lo, Hi, Mask); | |||
| 13953 | if (ByteRotation <= 0) | |||
| 13954 | return SDValue(); | |||
| 13955 | ||||
| 13956 | // Cast the inputs to i8 vector of correct length to match PALIGNR or | |||
| 13957 | // PSLLDQ/PSRLDQ. | |||
| 13958 | MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); | |||
| 13959 | Lo = DAG.getBitcast(ByteVT, Lo); | |||
| 13960 | Hi = DAG.getBitcast(ByteVT, Hi); | |||
| 13961 | ||||
| 13962 | // SSSE3 targets can use the palignr instruction. | |||
| 13963 | if (Subtarget.hasSSSE3()) { | |||
| 13964 | assert((!VT.is512BitVector() || Subtarget.hasBWI()) && | |||
| 13965 | "512-bit PALIGNR requires BWI instructions"); | |||
| 13966 | return DAG.getBitcast( | |||
| 13967 | VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi, | |||
| 13968 | DAG.getTargetConstant(ByteRotation, DL, MVT::i8))); | |||
| 13969 | } | |||
| 13970 | ||||
| 13971 | assert(VT.is128BitVector() && | |||
| 13972 | "Rotate-based lowering only supports 128-bit lowering!"); | |||
| 13973 | assert(Mask.size() <= 16 && | |||
| 13974 | "Can shuffle at most 16 bytes in a 128-bit vector!"); | |||
| 13975 | assert(ByteVT == MVT::v16i8 && | |||
| 13976 | "SSE2 rotate lowering only needed for v16i8!"); | |||
| 13977 | ||||
| 13978 | // Default SSE2 implementation | |||
| 13979 | int LoByteShift = 16 - ByteRotation; | |||
| 13980 | int HiByteShift = ByteRotation; | |||
| 13981 | ||||
| 13982 | SDValue LoShift = | |||
| 13983 | DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo, | |||
| 13984 | DAG.getTargetConstant(LoByteShift, DL, MVT::i8)); | |||
| 13985 | SDValue HiShift = | |||
| 13986 | DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi, | |||
| 13987 | DAG.getTargetConstant(HiByteShift, DL, MVT::i8)); | |||
| 13988 | return DAG.getBitcast(VT, | |||
| 13989 | DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift)); | |||
| 13990 | } | |||
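// Sketch (standalone): byte-level emulation of the SSE2 fallback above --
// shift Lo left by (16 - r) bytes, shift Hi right by r bytes, then OR, so
// the result is bytes [r, r+16) of the concatenation Lo:Hi (Hi low).
#include <array>
#include <cstdio>
using V16 = std::array<unsigned char, 16>;
int main() {
  const int r = 5; // byte rotation
  V16 Lo, Hi, Res{};
  for (int i = 0; i < 16; ++i) { Lo[i] = i; Hi[i] = 16 + i; }
  for (int i = 0; i < 16; ++i) {
    // PSLLDQ by (16-r): byte i comes from Lo[i-(16-r)] when in range.
    unsigned char L = (i >= 16 - r) ? Lo[i - (16 - r)] : 0;
    // PSRLDQ by r: byte i comes from Hi[i+r] when in range.
    unsigned char H = (i + r < 16) ? Hi[i + r] : 0;
    Res[i] = L | H;
  }
  for (unsigned char B : Res)
    printf("%d ", B); // 21 22 ... 31 0 1 2 3 4
  printf("\n");
}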
| 13991 | ||||
| 13992 | /// Try to lower a vector shuffle as a dword/qword rotation. | |||
| 13993 | /// | |||
| 13994 | /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary | |||
| 13995 | /// rotation of the concatenation of two vectors; this routine will | |||
| 13996 | /// try to generically lower a vector shuffle through such a pattern. | |||
| 13997 | /// | |||
| 13998 | /// Essentially it concatenates V1 and V2, shifts right by some number of | |||
| 13999 | /// elements, and takes the low elements as the result. Note that while this is | |||
| 14000 | /// specified as a *right shift* because x86 is little-endian, it is a *left | |||
| 14001 | /// rotate* of the vector lanes. | |||
| 14002 | static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1, | |||
| 14003 | SDValue V2, ArrayRef<int> Mask, | |||
| 14004 | const X86Subtarget &Subtarget, | |||
| 14005 | SelectionDAG &DAG) { | |||
| 14006 | assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) && | |||
| 14007 | "Only 32-bit and 64-bit elements are supported!"); | |||
| 14008 | ||||
| 14009 | // 128/256-bit vectors are only supported with VLX. | |||
| 14010 | assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector())) | |||
| 14011 | && "VLX required for 128/256-bit vectors"); | |||
| 14012 | ||||
| 14013 | SDValue Lo = V1, Hi = V2; | |||
| 14014 | int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask); | |||
| 14015 | if (Rotation <= 0) | |||
| 14016 | return SDValue(); | |||
| 14017 | ||||
| 14018 | return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi, | |||
| 14019 | DAG.getTargetConstant(Rotation, DL, MVT::i8)); | |||
| 14020 | } | |||
| 14021 | ||||
| 14022 | /// Try to lower a vector shuffle as a byte shift sequence. | |||
| 14023 | static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1, | |||
| 14024 | SDValue V2, ArrayRef<int> Mask, | |||
| 14025 | const APInt &Zeroable, | |||
| 14026 | const X86Subtarget &Subtarget, | |||
| 14027 | SelectionDAG &DAG) { | |||
| 14028 | assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!"); | |||
| 14029 | assert(VT.is128BitVector() && "Only 128-bit vectors supported"); | |||
| 14030 | ||||
| 14031 | // We need a shuffle that has zeros at one/both ends and a sequential | |||
| 14032 | // shuffle from one source within. | |||
| 14033 | unsigned ZeroLo = Zeroable.countr_one(); | |||
| 14034 | unsigned ZeroHi = Zeroable.countl_one(); | |||
| 14035 | if (!ZeroLo && !ZeroHi) | |||
| 14036 | return SDValue(); | |||
| 14037 | ||||
| 14038 | unsigned NumElts = Mask.size(); | |||
| 14039 | unsigned Len = NumElts - (ZeroLo + ZeroHi); | |||
| 14040 | if (!isSequentialOrUndefInRange(Mask, ZeroLo, Len, Mask[ZeroLo])) | |||
| 14041 | return SDValue(); | |||
| 14042 | ||||
| 14043 | unsigned Scale = VT.getScalarSizeInBits() / 8; | |||
| 14044 | ArrayRef<int> StubMask = Mask.slice(ZeroLo, Len); | |||
| 14045 | if (!isUndefOrInRange(StubMask, 0, NumElts) && | |||
| 14046 | !isUndefOrInRange(StubMask, NumElts, 2 * NumElts)) | |||
| 14047 | return SDValue(); | |||
| 14048 | ||||
| 14049 | SDValue Res = Mask[ZeroLo] < (int)NumElts ? V1 : V2; | |||
| 14050 | Res = DAG.getBitcast(MVT::v16i8, Res); | |||
| 14051 | ||||
| 14052 | // Use VSHLDQ/VSRLDQ ops to zero the ends of a vector and leave an | |||
| 14053 | // inner sequential set of elements, possibly offset: | |||
| 14054 | // 01234567 --> zzzzzz01 --> 1zzzzzzz | |||
| 14055 | // 01234567 --> 4567zzzz --> zzzzz456 | |||
| 14056 | // 01234567 --> z0123456 --> 3456zzzz --> zz3456zz | |||
| 14057 | if (ZeroLo == 0) { | |||
| 14058 | unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); | |||
| 14059 | Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, | |||
| 14060 | DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); | |||
| 14061 | Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, | |||
| 14062 | DAG.getTargetConstant(Scale * ZeroHi, DL, MVT::i8)); | |||
| 14063 | } else if (ZeroHi == 0) { | |||
| 14064 | unsigned Shift = Mask[ZeroLo] % NumElts; | |||
| 14065 | Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, | |||
| 14066 | DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); | |||
| 14067 | Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, | |||
| 14068 | DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8)); | |||
| 14069 | } else if (!Subtarget.hasSSSE3()) { | |||
| 14070 | // If we don't have PSHUFB then it's worth avoiding an AND constant mask | |||
| 14071 | // by performing 3 byte shifts. Shuffle combining can kick in above that. | |||
| 14072 | // TODO: There may be some cases where VSH{LR}DQ+PAND is still better. | |||
| 14073 | unsigned Shift = (NumElts - 1) - (Mask[ZeroLo + Len - 1] % NumElts); | |||
| 14074 | Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, | |||
| 14075 | DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); | |||
| 14076 | Shift += Mask[ZeroLo] % NumElts; | |||
| 14077 | Res = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Res, | |||
| 14078 | DAG.getTargetConstant(Scale * Shift, DL, MVT::i8)); | |||
| 14079 | Res = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Res, | |||
| 14080 | DAG.getTargetConstant(Scale * ZeroLo, DL, MVT::i8)); | |||
| 14081 | } else | |||
| 14082 | return SDValue(); | |||
| 14083 | ||||
| 14084 | return DAG.getBitcast(VT, Res); | |||
| 14085 | } | |||
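// Sketch (standalone): the three-shift trick from the !Subtarget.hasSSSE3()
// branch, demonstrated on a u64 standing in for a byte vector (on a
// little-endian value, PSLLDQ n == value << 8n and PSRLDQ n == value >> 8n).
#include <cstdint>
#include <cstdio>
int main() {
  uint64_t V = 0x0706050403020100ull; // bytes 0..7, cf. "01234567" above
  uint64_t A = V << 8;                // z0123456: push the run upward
  uint64_t B = A >> 32;               // 3456zzzz: drop low bytes, zero top
  uint64_t C = B << 16;               // zz3456zz: re-position, zero bottom
  printf("0x%016llx\n", (unsigned long long)C); // 0x0000060504030000
}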
| 14086 | ||||
| 14087 | /// Try to lower a vector shuffle as a bit shift (shifts in zeros). | |||
| 14088 | /// | |||
| 14089 | /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and | |||
| 14090 | /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function | |||
| 14091 | /// matches elements from one of the input vectors shuffled to the left or | |||
| 14092 | /// right with zeroable elements 'shifted in'. It handles both the strictly | |||
| 14093 | /// bit-wise element shifts and the byte shift across an entire 128-bit double | |||
| 14094 | /// quad word lane. | |||
| 14095 | /// | |||
| 14096 | /// PSHL : (little-endian) left bit shift. | |||
| 14097 | /// [ zz, 0, zz, 2 ] | |||
| 14098 | /// [ -1, 4, zz, -1 ] | |||
| 14099 | /// PSRL : (little-endian) right bit shift. | |||
| 14100 | /// [ 1, zz, 3, zz] | |||
| 14101 | /// [ -1, -1, 7, zz] | |||
| 14102 | /// PSLLDQ : (little-endian) left byte shift | |||
| 14103 | /// [ zz, 0, 1, 2, 3, 4, 5, 6] | |||
| 14104 | /// [ zz, zz, -1, -1, 2, 3, 4, -1] | |||
| 14105 | /// [ zz, zz, zz, zz, zz, zz, -1, 1] | |||
| 14106 | /// PSRLDQ : (little-endian) right byte shift | |||
| 14107 | /// [ 5, 6, 7, zz, zz, zz, zz, zz] | |||
| 14108 | /// [ -1, 5, 6, 7, zz, zz, zz, zz] | |||
| 14109 | /// [ 1, 2, -1, -1, -1, -1, zz, zz] | |||
| 14110 | static int matchShuffleAsShift(MVT &ShiftVT, unsigned &Opcode, | |||
| 14111 | unsigned ScalarSizeInBits, ArrayRef<int> Mask, | |||
| 14112 | int MaskOffset, const APInt &Zeroable, | |||
| 14113 | const X86Subtarget &Subtarget) { | |||
| 14114 | int Size = Mask.size(); | |||
| 14115 | unsigned SizeInBits = Size * ScalarSizeInBits; | |||
| 14116 | ||||
| 14117 | auto CheckZeros = [&](int Shift, int Scale, bool Left) { | |||
| 14118 | for (int i = 0; i < Size; i += Scale) | |||
| 14119 | for (int j = 0; j < Shift; ++j) | |||
| 14120 | if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))]) | |||
| 14121 | return false; | |||
| 14122 | ||||
| 14123 | return true; | |||
| 14124 | }; | |||
| 14125 | ||||
| 14126 | auto MatchShift = [&](int Shift, int Scale, bool Left) { | |||
| 14127 | for (int i = 0; i != Size; i += Scale) { | |||
| 14128 | unsigned Pos = Left ? i + Shift : i; | |||
| 14129 | unsigned Low = Left ? i : i + Shift; | |||
| 14130 | unsigned Len = Scale - Shift; | |||
| 14131 | if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset)) | |||
| 14132 | return -1; | |||
| 14133 | } | |||
| 14134 | ||||
| 14135 | int ShiftEltBits = ScalarSizeInBits * Scale; | |||
| 14136 | bool ByteShift = ShiftEltBits > 64; | |||
| 14137 | Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI) | |||
| 14138 | : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI); | |||
| 14139 | int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1); | |||
| 14140 | ||||
| 14141 | // Normalize the scale for byte shifts to still produce an i64 element | |||
| 14142 | // type. | |||
| 14143 | Scale = ByteShift ? Scale / 2 : Scale; | |||
| 14144 | ||||
| 14145 | // We need to round trip through the appropriate type for the shift. | |||
| 14146 | MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale); | |||
| 14147 | ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8) | |||
| 14148 | : MVT::getVectorVT(ShiftSVT, Size / Scale); | |||
| 14149 | return (int)ShiftAmt; | |||
| 14150 | }; | |||
| 14151 | ||||
| 14152 | // SSE/AVX supports logical shifts up to 64-bit integers - so we can just | |||
| 14153 | // keep doubling the size of the integer elements up to that. We can | |||
| 14154 | // then shift the elements of the integer vector by whole multiples of | |||
| 14155 | // their width within the elements of the larger integer vector. Test each | |||
| 14156 | // multiple to see if we can find a match with the moved element indices | |||
| 14157 | // and that the shifted in elements are all zeroable. | |||
| 14158 | unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128); | |||
| 14159 | for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2) | |||
| 14160 | for (int Shift = 1; Shift != Scale; ++Shift) | |||
| 14161 | for (bool Left : {true, false}) | |||
| 14162 | if (CheckZeros(Shift, Scale, Left)) { | |||
| 14163 | int ShiftAmt = MatchShift(Shift, Scale, Left); | |||
| 14164 | if (0 < ShiftAmt) | |||
| 14165 | return ShiftAmt; | |||
| 14166 | } | |||
| 14167 | ||||
| 14168 | // no match | |||
| 14169 | return -1; | |||
| 14170 | } | |||
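// Sketch (standalone): the v4i32 mask {1, zz, 3, zz} matches Scale = 2,
// Shift = 1, Left = false above, i.e. a 32-bit VSRLI on v2i64. Scalar model
// of one 64-bit element holding lanes {a, b}: after >> 32 it holds {b, 0},
// exactly "take element 1, shift in zeros".
#include <cstdint>
#include <cstdio>
int main() {
  uint32_t a = 0x11111111u, b = 0x22222222u;
  uint64_t Elt = ((uint64_t)b << 32) | a; // little-endian lanes {a, b}
  uint64_t Shifted = Elt >> 32;           // lanes {b, 0}
  printf("lo=0x%08x hi=0x%08x\n", (uint32_t)Shifted,
         (uint32_t)(Shifted >> 32));
}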
| 14171 | ||||
| 14172 | static SDValue lowerShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, | |||
| 14173 | SDValue V2, ArrayRef<int> Mask, | |||
| 14174 | const APInt &Zeroable, | |||
| 14175 | const X86Subtarget &Subtarget, | |||
| 14176 | SelectionDAG &DAG, bool BitwiseOnly) { | |||
| 14177 | int Size = Mask.size(); | |||
| 14178 | assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); | |||
| 14179 | ||||
| 14180 | MVT ShiftVT; | |||
| 14181 | SDValue V = V1; | |||
| 14182 | unsigned Opcode; | |||
| 14183 | ||||
| 14184 | // Try to match shuffle against V1 shift. | |||
| 14185 | int ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), | |||
| 14186 | Mask, 0, Zeroable, Subtarget); | |||
| 14187 | ||||
| 14188 | // If V1 failed, try to match shuffle against V2 shift. | |||
| 14189 | if (ShiftAmt < 0) { | |||
| 14190 | ShiftAmt = matchShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(), | |||
| 14191 | Mask, Size, Zeroable, Subtarget); | |||
| 14192 | V = V2; | |||
| 14193 | } | |||
| 14194 | ||||
| 14195 | if (ShiftAmt < 0) | |||
| 14196 | return SDValue(); | |||
| 14197 | ||||
| 14198 | if (BitwiseOnly && (Opcode == X86ISD::VSHLDQ || Opcode == X86ISD::VSRLDQ)) | |||
| 14199 | return SDValue(); | |||
| 14200 | ||||
| 14201 | assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) && | |||
| 14202 | "Illegal integer vector type"); | |||
| 14203 | V = DAG.getBitcast(ShiftVT, V); | |||
| 14204 | V = DAG.getNode(Opcode, DL, ShiftVT, V, | |||
| 14205 | DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); | |||
| 14206 | return DAG.getBitcast(VT, V); | |||
| 14207 | } | |||
| 14208 | ||||
| 14209 | // EXTRQ: Extract Len elements from lower half of source, starting at Idx. | |||
| 14210 | // Remainder of lower half result is zero and upper half is all undef. | |||
| 14211 | static bool matchShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, | |||
| 14212 | ArrayRef<int> Mask, uint64_t &BitLen, | |||
| 14213 | uint64_t &BitIdx, const APInt &Zeroable) { | |||
| 14214 | int Size = Mask.size(); | |||
| 14215 | int HalfSize = Size / 2; | |||
| 14216 | assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); | |||
| 14217 | assert(!Zeroable.isAllOnes() && "Fully zeroable shuffle mask"); | |||
| 14218 | ||||
| 14219 | // Upper half must be undefined. | |||
| 14220 | if (!isUndefUpperHalf(Mask)) | |||
| 14221 | return false; | |||
| 14222 | ||||
| 14223 | // Determine the extraction length from the part of the | |||
| 14224 | // lower half that isn't zeroable. | |||
| 14225 | int Len = HalfSize; | |||
| 14226 | for (; Len > 0; --Len) | |||
| 14227 | if (!Zeroable[Len - 1]) | |||
| 14228 | break; | |||
| 14229 | assert(Len > 0 && "Zeroable shuffle mask"); | |||
| 14230 | ||||
| 14231 | // Attempt to match first Len sequential elements from the lower half. | |||
| 14232 | SDValue Src; | |||
| 14233 | int Idx = -1; | |||
| 14234 | for (int i = 0; i != Len; ++i) { | |||
| 14235 | int M = Mask[i]; | |||
| 14236 | if (M == SM_SentinelUndef) | |||
| 14237 | continue; | |||
| 14238 | SDValue &V = (M < Size ? V1 : V2); | |||
| 14239 | M = M % Size; | |||
| 14240 | ||||
| 14241 | // The extracted elements must start at a valid index and all mask | |||
| 14242 | // elements must be in the lower half. | |||
| 14243 | if (i > M || M >= HalfSize) | |||
| 14244 | return false; | |||
| 14245 | ||||
| 14246 | if (Idx < 0 || (Src == V && Idx == (M - i))) { | |||
| 14247 | Src = V; | |||
| 14248 | Idx = M - i; | |||
| 14249 | continue; | |||
| 14250 | } | |||
| 14251 | return false; | |||
| 14252 | } | |||
| 14253 | ||||
| 14254 | if (!Src || Idx < 0) | |||
| 14255 | return false; | |||
| 14256 | ||||
| 14257 | assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); | |||
| 14258 | BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; | |||
| 14259 | BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; | |||
| 14260 | V1 = Src; | |||
| 14261 | return true; | |||
| 14262 | } | |||
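// Sketch (standalone): the EXTRQI immediates are element counts scaled to
// bits and masked to the instruction's 6-bit fields. For a v8i16 shuffle
// extracting Len = 3 sequential elements starting at Idx = 2 (a full 64-bit
// length wraps to 0, which SSE4a treats as meaning all 64 bits):
#include <cstdint>
#include <cstdio>
int main() {
  uint64_t Len = 3, Idx = 2, EltBits = 16;
  uint64_t BitLen = (Len * EltBits) & 0x3f; // 48
  uint64_t BitIdx = (Idx * EltBits) & 0x3f; // 32
  printf("BitLen=%llu BitIdx=%llu\n", (unsigned long long)BitLen,
         (unsigned long long)BitIdx);
}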
| 14263 | ||||
| 14264 | // INSERTQ: Extract lowest Len elements from lower half of second source and | |||
| 14265 | // insert over first source, starting at Idx. | |||
| 14266 | // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } | |||
| 14267 | static bool matchShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, | |||
| 14268 | ArrayRef<int> Mask, uint64_t &BitLen, | |||
| 14269 | uint64_t &BitIdx) { | |||
| 14270 | int Size = Mask.size(); | |||
| 14271 | int HalfSize = Size / 2; | |||
| 14272 | assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); | |||
| 14273 | ||||
| 14274 | // Upper half must be undefined. | |||
| 14275 | if (!isUndefUpperHalf(Mask)) | |||
| 14276 | return false; | |||
| 14277 | ||||
| 14278 | for (int Idx = 0; Idx != HalfSize; ++Idx) { | |||
| 14279 | SDValue Base; | |||
| 14280 | ||||
| 14281 | // Attempt to match first source from mask before insertion point. | |||
| 14282 | if (isUndefInRange(Mask, 0, Idx)) { | |||
| 14283 | /* EMPTY */ | |||
| 14284 | } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { | |||
| 14285 | Base = V1; | |||
| 14286 | } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { | |||
| 14287 | Base = V2; | |||
| 14288 | } else { | |||
| 14289 | continue; | |||
| 14290 | } | |||
| 14291 | ||||
| 14292 | // Extend the extraction length looking to match both the insertion of | |||
| 14293 | // the second source and the remaining elements of the first. | |||
| 14294 | for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { | |||
| 14295 | SDValue Insert; | |||
| 14296 | int Len = Hi - Idx; | |||
| 14297 | ||||
| 14298 | // Match insertion. | |||
| 14299 | if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { | |||
| 14300 | Insert = V1; | |||
| 14301 | } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { | |||
| 14302 | Insert = V2; | |||
| 14303 | } else { | |||
| 14304 | continue; | |||
| 14305 | } | |||
| 14306 | ||||
| 14307 | // Match the remaining elements of the lower half. | |||
| 14308 | if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { | |||
| 14309 | /* EMPTY */ | |||
| 14310 | } else if ((!Base || (Base == V1)) && | |||
| 14311 | isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { | |||
| 14312 | Base = V1; | |||
| 14313 | } else if ((!Base || (Base == V2)) && | |||
| 14314 | isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, | |||
| 14315 | Size + Hi)) { | |||
| 14316 | Base = V2; | |||
| 14317 | } else { | |||
| 14318 | continue; | |||
| 14319 | } | |||
| 14320 | ||||
| 14321 | BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; | |||
| 14322 | BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; | |||
| 14323 | V1 = Base; | |||
| 14324 | V2 = Insert; | |||
| 14325 | return true; | |||
| 14326 | } | |||
| 14327 | } | |||
| 14328 | ||||
| 14329 | return false; | |||
| 14330 | } | |||
| 14331 | ||||
| 14332 | /// Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. | |||
| 14333 | static SDValue lowerShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, | |||
| 14334 | SDValue V2, ArrayRef<int> Mask, | |||
| 14335 | const APInt &Zeroable, SelectionDAG &DAG) { | |||
| 14336 | uint64_t BitLen, BitIdx; | |||
| 14337 | if (matchShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) | |||
| 14338 | return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, | |||
| 14339 | DAG.getTargetConstant(BitLen, DL, MVT::i8), | |||
| 14340 | DAG.getTargetConstant(BitIdx, DL, MVT::i8)); | |||
| 14341 | ||||
| 14342 | if (matchShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) | |||
| 14343 | return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT), | |||
| 14344 | V2 ? V2 : DAG.getUNDEF(VT), | |||
| 14345 | DAG.getTargetConstant(BitLen, DL, MVT::i8), | |||
| 14346 | DAG.getTargetConstant(BitIdx, DL, MVT::i8)); | |||
| 14347 | ||||
| 14348 | return SDValue(); | |||
| 14349 | } | |||
| 14350 | ||||
| 14351 | /// Lower a vector shuffle as a zero or any extension. | |||
| 14352 | /// | |||
| 14353 | /// Given a specific number of elements, element bit width, and extension | |||
| 14354 | /// stride, produce either a zero or any extension based on the available | |||
| 14355 | /// features of the subtarget. The extended elements are consecutive and | |||
| 14356 | /// can start from an offset element index in the input; to | |||
| 14357 | /// avoid excess shuffling the offset must either be in the bottom lane | |||
| 14358 | /// or at the start of a higher lane. All extended elements must be from | |||
| 14359 | /// the same lane. | |||
| 14360 | static SDValue lowerShuffleAsSpecificZeroOrAnyExtend( | |||
| 14361 | const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV, | |||
| 14362 | ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { | |||
| 14363 | assert(Scale > 1 && "Need a scale to extend."); | |||
| 14364 | int EltBits = VT.getScalarSizeInBits(); | |||
| 14365 | int NumElements = VT.getVectorNumElements(); | |||
| 14366 | int NumEltsPerLane = 128 / EltBits; | |||
| 14367 | int OffsetLane = Offset / NumEltsPerLane; | |||
| 14368 | assert((EltBits == 8 || EltBits == 16 || EltBits == 32) && | |||
| 14369 | "Only 8, 16, and 32 bit elements can be extended."); | |||
| 14370 | assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits."); | |||
| 14371 | assert(0 <= Offset && "Extension offset must be positive."); | |||
| 14372 | assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) && | |||
| 14373 | "Extension offset must be in the first lane or start an upper lane."); | |||
| 14374 | ||||
| 14375 | // Check that an index is in same lane as the base offset. | |||
| 14376 | auto SafeOffset = [&](int Idx) { | |||
| 14377 | return OffsetLane == (Idx / NumEltsPerLane); | |||
| 14378 | }; | |||
| 14379 | ||||
| 14380 | // Shift along an input so that the offset base moves to the first element. | |||
| 14381 | auto ShuffleOffset = [&](SDValue V) { | |||
| 14382 | if (!Offset) | |||
| 14383 | return V; | |||
| 14384 | ||||
| 14385 | SmallVector<int, 8> ShMask((unsigned)NumElements, -1); | |||
| 14386 | for (int i = 0; i * Scale < NumElements; ++i) { | |||
| 14387 | int SrcIdx = i + Offset; | |||
| 14388 | ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1; | |||
| 14389 | } | |||
| 14390 | return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask); | |||
| 14391 | }; | |||
| 14392 | ||||
| 14393 | // Found a valid a/zext mask! Try various lowering strategies based on the | |||
| 14394 | // input type and available ISA extensions. | |||
| 14395 | if (Subtarget.hasSSE41()) { | |||
| 14396 | // Not worth offsetting 128-bit vectors if scale == 2, a pattern using | |||
| 14397 | // PUNPCK will catch this in a later shuffle match. | |||
| 14398 | if (Offset && Scale == 2 && VT.is128BitVector()) | |||
| 14399 | return SDValue(); | |||
| 14400 | MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale), | |||
| 14401 | NumElements / Scale); | |||
| 14402 | InputV = DAG.getBitcast(VT, InputV); | |||
| 14403 | InputV = ShuffleOffset(InputV); | |||
| 14404 | InputV = getEXTEND_VECTOR_INREG(AnyExt ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND, | |||
| 14405 | DL, ExtVT, InputV, DAG); | |||
| 14406 | return DAG.getBitcast(VT, InputV); | |||
| 14407 | } | |||
| 14408 | ||||
| 14409 | assert(VT.is128BitVector() && "Only 128-bit vectors can be extended."); | |||
| 14410 | InputV = DAG.getBitcast(VT, InputV); | |||
| 14411 | ||||
| 14412 | // For any extends we can cheat for larger element sizes and use shuffle | |||
| 14413 | // instructions that can fold with a load and/or copy. | |||
| 14414 | if (AnyExt && EltBits == 32) { | |||
| 14415 | int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1, | |||
| 14416 | -1}; | |||
| 14417 | return DAG.getBitcast( | |||
| 14418 | VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, | |||
| 14419 | DAG.getBitcast(MVT::v4i32, InputV), | |||
| 14420 | getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); | |||
| 14421 | } | |||
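  // For example, with Offset == 0 the PSHUFD mask is {0, -1, 1, -1}: the two
  // low dwords land in the even lanes and the odd lanes stay undef, which is
  // exactly a 32->64-bit any-extend.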
  if (AnyExt && EltBits == 16 && Scale > 2) {
    int PSHUFDMask[4] = {Offset / 2, -1,
                         SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
    InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
                         DAG.getBitcast(MVT::v4i32, InputV),
                         getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    int PSHUFWMask[4] = {1, -1, -1, -1};
    unsigned OddEvenOp = (Offset & 1) ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
    return DAG.getBitcast(
        VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
                        DAG.getBitcast(MVT::v8i16, InputV),
                        getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
  }

  // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
  // to 64 bits.
  if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
    assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
    assert(VT.is128BitVector() && "Unexpected vector width!");

    int LoIdx = Offset * EltBits;
    SDValue Lo = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getTargetConstant(EltBits, DL, MVT::i8),
                                DAG.getTargetConstant(LoIdx, DL, MVT::i8)));

    if (isUndefUpperHalf(Mask) || !SafeOffset(Offset + 1))
      return DAG.getBitcast(VT, Lo);

    int HiIdx = (Offset + 1) * EltBits;
    SDValue Hi = DAG.getBitcast(
        MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
                                DAG.getTargetConstant(EltBits, DL, MVT::i8),
                                DAG.getTargetConstant(HiIdx, DL, MVT::i8)));
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
  }
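  // For example, with v8i16, Scale == 4 and Offset == 1: EXTRQI(V, 16, 16)
  // zero-extends element 1 into the low i64, EXTRQI(V, 16, 32) does the same
  // for element 2, and UNPCKL then interleaves the two 64-bit results.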

  // If this would require more than 2 unpack instructions to expand, use
  // pshufb when available. We can only use more than 2 unpack instructions
  // when zero extending i8 elements which also makes it easier to use pshufb.
  if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
    assert(NumElements == 16 && "Unexpected byte vector width!");
    SDValue PSHUFBMask[16];
    for (int i = 0; i < 16; ++i) {
      int Idx = Offset + (i / Scale);
      if (i % Scale == 0 && SafeOffset(Idx)) {
        PSHUFBMask[i] = DAG.getConstant(Idx, DL, MVT::i8);
        continue;
      }
      PSHUFBMask[i] =
          AnyExt ? DAG.getUNDEF(MVT::i8) : DAG.getConstant(0x80, DL, MVT::i8);
    }
    InputV = DAG.getBitcast(MVT::v16i8, InputV);
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
                        DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
  }
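  // For example, zero-extending v16i8 with Scale == 8 and Offset == 0 builds
  // the PSHUFB control {0, 0x80 x 7, 1, 0x80 x 7}: bytes 0 and 1 land at the
  // bottom of each i64 and every other byte is zeroed.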

  // If we are extending from an offset, ensure we start on a boundary that
  // we can unpack from.
  int AlignToUnpack = Offset % (NumElements / Scale);
  if (AlignToUnpack) {
    SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
    for (int i = AlignToUnpack; i < NumElements; ++i)
      ShMask[i - AlignToUnpack] = i;
    InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
    Offset -= AlignToUnpack;
  }

  // Otherwise emit a sequence of unpacks.
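  // For example, zero-extending the low four bytes of a v16i8 to v4i32
  // (Scale == 4) on a pre-SSE4.1 target emits PUNPCKLBW with a zero vector
  // and then PUNPCKLWD with a zero vector.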
  do {
    unsigned UnpackLoHi = X86ISD::UNPCKL;
    if (Offset >= (NumElements / 2)) {
      UnpackLoHi = X86ISD::UNPCKH;
      Offset -= (NumElements / 2);
    }

    MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
    SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
                         : getZeroVector(InputVT, Subtarget, DAG, DL);
    InputV = DAG.getBitcast(InputVT, InputV);
    InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
    Scale /= 2;
    EltBits *= 2;
    NumElements /= 2;
  } while (Scale > 1);
  return DAG.getBitcast(VT, InputV);
}

/// Try to lower a vector shuffle as a zero extension on any microarch.
///
/// This routine will try to do everything in its power to cleverly lower
/// a shuffle which happens to match the pattern of a zero extend. It doesn't
/// check for the profitability of this lowering; it tries to aggressively
/// match this pattern. It will use all of the micro-architectural details it
/// can to emit an efficient lowering. It handles both blends with all-zero
/// inputs (to explicitly zero-extend) and undef lanes (sometimes undef due
/// to masking out later).
///
/// The reason we have dedicated lowering for zext-style shuffles is that they
/// are both incredibly common and often quite performance sensitive.
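/// For example, with an all-zero V2 the v4i32 mask <0, 4, 1, 5> is matched
/// here as a zero-extension of the two low i32 elements of V1 to i64.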
static SDValue lowerShuffleAsZeroOrAnyExtend(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  int Bits = VT.getSizeInBits();
  int NumLanes = Bits / 128;
  int NumElements = VT.getVectorNumElements();
  int NumEltsPerLane = NumElements / NumLanes;
  assert(VT.getScalarSizeInBits() <= 32 &&
         "Exceeds 32-bit integer zero extension limit");
  assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");

  // Define a helper function to check a particular ext-scale and lower to it
  // if valid.
  auto Lower = [&](int Scale) -> SDValue {
    SDValue InputV;
    bool AnyExt = true;
    int Offset = 0;
    int Matches = 0;
    for (int i = 0; i < NumElements; ++i) {
      int M = Mask[i];
      if (M < 0)
        continue; // Valid anywhere but doesn't tell us anything.
      if (i % Scale != 0) {
        // Each of the extended elements needs to be zeroable.
        if (!Zeroable[i])
          return SDValue();

        // We are no longer in the any-extend case.
        AnyExt = false;
        continue;
      }

      // The base elements need to be consecutive indices into the same input
      // vector.
      SDValue V = M < NumElements ? V1 : V2;
      M = M % NumElements;
      if (!InputV) {
        InputV = V;
        Offset = M - (i / Scale);
      } else if (InputV != V)
        return SDValue(); // Flip-flopping inputs.

      // Offset must start in the lowest 128-bit lane or at the start of an
      // upper lane.
      // FIXME: Is it ever worth allowing a negative base offset?
      if (!((0 <= Offset && Offset < NumEltsPerLane) ||
            (Offset % NumEltsPerLane) == 0))
        return SDValue();

      // If we are offsetting, all referenced entries must come from the same
      // lane.
      if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
        return SDValue();

      if ((M % NumElements) != (Offset + (i / Scale)))
        return SDValue(); // Non-consecutive strided elements.
      Matches++;
    }

    // If we fail to find an input, we have a zero-shuffle which should always
    // have already been handled.
    // FIXME: Maybe handle this here in case during blending we end up with one?
    if (!InputV)
      return SDValue();

    // If we are offsetting, don't extend if we only match a single input; we
    // can always do better by using a basic PSHUF or PUNPCK.
    if (Offset != 0 && Matches < 2)
      return SDValue();

    return lowerShuffleAsSpecificZeroOrAnyExtend(DL, VT, Scale, Offset, AnyExt,
                                                 InputV, Mask, Subtarget, DAG);
  };
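  // For example, the v4i32 mask <2, u, 3, u> (u == undef or zeroable) matches
  // at Scale == 2 with InputV == V1, Offset == 2 and two matched elements.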

  // The widest scale possible for extending is to a 64-bit integer.
  assert(Bits % 64 == 0 &&
         "The number of bits in a vector must be divisible by 64 on x86!");
  int NumExtElements = Bits / 64;

  // Each iteration, try extending the elements half as much, but into twice as
  // many elements.
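  // For example, for v16i8 this tries Scale == 8 (two i64 elements), then
  // Scale == 4 (four i32 elements), then Scale == 2 (eight i16 elements).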
  for (; NumExtElements < NumElements; NumExtElements *= 2) {
    assert(NumElements % NumExtElements == 0 &&
           "The input vector size must be divisible by the extended size.");
    if (SDValue V = Lower(NumElements / NumExtElements))
      return V;
  }

  // General extends failed, but 128-bit vectors may be able to use MOVQ.
  if (Bits != 128)
    return SDValue();

  // Returns one of the source operands if the shuffle can be reduced to a
  // MOVQ, copying the lower 64 bits and zero-filling the upper 64 bits.
  auto CanZExtLowHalf = [&]() {
    for (int i = NumElements / 2; i != NumElements; ++i)
      if (!Zeroable[i])
        return SDValue();
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
      return V1;
    if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
      return V2;
    return SDValue();
  };
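  // For example, the v4i32 mask <0, 1, z, z> (z == zeroable) reduces to MOVQ
  // on V1: the low i64 is copied and the upper i64 is zeroed.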

  if (SDValue V = CanZExtLowHalf()) {
    V = DAG.getBitcast(MVT::v2i64, V);
    V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
    return DAG.getBitcast(VT, V);
  }

  // No viable ext lowering found.
  return SDValue();
}

/// Try to get a scalar value for a specific element of a vector.
///
/// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
                                              SelectionDAG &DAG) {
  MVT VT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  V = peekThroughBitcasts(V);

  // If the bitcasts change the element size, we can't extract an equivalent
  // element from it.
  MVT NewVT = V.getSimpleValueType();
  if (!NewVT.isVector() ||
      NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
    return SDValue();

  if (V.getOpcode() == ISD::BUILD_VECTOR ||
      (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
    // Ensure the scalar operand is the same size as the destination.
    // FIXME: Add support for scalar truncation where possible.
    SDValue S = V.getOperand(Idx);
    if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
      return DAG.getBitcast(EltVT, S);
  }

  return SDValue();
}

/// Helper to test for a load that can be folded with x86 shuffles.
///
/// This is particularly important because the set of instructions varies
/// significantly based on whether the operand is a load or not.
static bool isShuffleFoldableLoad(SDValue V) {
  return V->hasOneUse() &&
         ISD::isNON_EXTLoad(peekThroughOneUseBitcasts(V).getNode());
}

template <typename T>
static bool isSoftFP16(T VT, const X86Subtarget &Subtarget) {
  return VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16();
}

template <typename T>
bool X86TargetLowering::isSoftFP16(T VT) const {
  return ::isSoftFP16(VT, Subtarget);
}

/// Try to lower insertion of a single element into a zero vector.
///
/// This is a common pattern and we have especially efficient lowerings for it
/// across all subtarget feature sets.
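/// For example, the v4f32 mask <4, 1, 2, 3> inserts V2[0] into the low
/// element of V1 and can lower to a single MOVSS.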
static SDValue lowerShuffleAsElementInsertion(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, const X86Subtarget &Subtarget,
    SelectionDAG &DAG) {
  MVT ExtVT = VT;
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltBits = VT.getScalarSizeInBits();

  if (isSoftFP16(EltVT, Subtarget))
    return SDValue();

  int V2Index =
      find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
      Mask.begin();
  bool IsV1Constant = getTargetConstantFromNode(V1) != nullptr;
  bool IsV1Zeroable = true;
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (i != V2Index && !Zeroable[i]) {
      IsV1Zeroable = false;
      break;
    }

  // Bail if a non-zero V1 isn't used in place.
  if (!IsV1Zeroable) {
    SmallVector<int, 8> V1Mask(Mask);
    V1Mask[V2Index] = -1;
    if (!isNoopShuffleMask(V1Mask))
      return SDValue();
  }

  // Check for a single input from a SCALAR_TO_VECTOR node.
  // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
  // all the smarts here sunk into that routine. However, the current
  // lowering of BUILD_VECTOR makes that nearly impossible until the old
  // vector shuffle lowering is dead.
  SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
                                               DAG);
  if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
    // We need to zext the scalar if it is smaller than an i32.
    V2S = DAG.getBitcast(EltVT, V2S);
    if (EltVT == MVT::i8 || (EltVT == MVT::i16 && !Subtarget.hasFP16())) {
      // Using zext to expand a narrow element won't work for non-zero
      // insertions. But we can use a masked constant vector if we're
      // inserting V2 into the bottom of V1.
      if (!IsV1Zeroable && !(IsV1Constant && V2Index == 0))
        return SDValue();

      // Zero-extend directly to i32.
      ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
      V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);

      // If we're inserting into a constant, mask off the inserted index
      // and OR with the zero-extended scalar.
      if (!IsV1Zeroable) {
        SmallVector<APInt> Bits(NumElts, APInt::getAllOnes(EltBits));
        Bits[V2Index] = APInt::getZero(EltBits);
        SDValue BitMask = getConstVector(Bits, VT, DAG, DL);
        V1 = DAG.getNode(ISD::AND, DL, VT, V1, BitMask);
        V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
        V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2));
        return DAG.getNode(ISD::OR, DL, VT, V1, V2);
      }
    }
    V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
  } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
             EltVT == MVT::i16) {
    // Either not inserting from the low element of the input or the input
    // element size is too small to use VZEXT_MOVL to clear the high bits.
    return SDValue();
  }

  if (!IsV1Zeroable) {
    // If V1 can't be treated as a zero vector we have fewer options to lower
    // this. We can't cheaply support integer vectors or insertions anywhere
    // but the low element.
    assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
    if (!VT.isFloatingPoint() || V2Index != 0)
      return SDValue();
    if (!VT.is128BitVector())
      return SDValue();

    // Otherwise, use MOVSD, MOVSS or MOVSH.
    unsigned MovOpc = 0;
    if (EltVT == MVT::f16)
      MovOpc = X86ISD::MOVSH;
    else if (EltVT == MVT::f32)
      MovOpc = X86ISD::MOVSS;
    else if (EltVT == MVT::f64)
      MovOpc = X86ISD::MOVSD;
    else
      llvm_unreachable("Unsupported floating point element type to handle!");
    return DAG.getNode(MovOpc, DL, ExtVT, V1, V2);
  }

  // This lowering only works for the low element with floating-point vectors.
  if (VT.isFloatingPoint() && V2Index != 0)
    return SDValue();

  V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
  if (ExtVT != VT)
    V2 = DAG.getBitcast(VT, V2);

  if (V2Index != 0) {
    // If we have 4 or fewer lanes we can cheaply shuffle the element into
    // the desired position. Otherwise it is more efficient to do a vector
    // shift left. We know that we can do a vector shift left because all
    // the other lanes are zero.
    if (VT.isFloatingPoint() || NumElts <= 4) {
      SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
      V2Shuffle[V2Index] = 0;
      V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
    } else {
      V2 = DAG.getBitcast(MVT::v16i8, V2);
      V2 = DAG.getNode(
          X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
          DAG.getTargetConstant(V2Index * EltBits / 8, DL, MVT::i8));
      V2 = DAG.getBitcast(VT, V2);
    }
  }
  return V2;
}

/// Try to lower a broadcast of a single (truncated) integer element coming
/// from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// This assumes we have AVX2.
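/// For example, broadcasting i8 element 5 of a v4i32 build_vector uses
/// Scale == 4: operand 1 is shifted right by 8 bits, truncated to i8 and
/// then broadcast.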
static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
                                            int BroadcastIdx,
                                            const X86Subtarget &Subtarget,
                                            SelectionDAG &DAG) {
  assert(Subtarget.hasAVX2() &&
         "We can only lower integer broadcasts with AVX2!");

  MVT EltVT = VT.getVectorElementType();
  MVT V0VT = V0.getSimpleValueType();

  assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
  assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");

  MVT V0EltVT = V0VT.getVectorElementType();
  if (!V0EltVT.isInteger())
    return SDValue();

  const unsigned EltSize = EltVT.getSizeInBits();
  const unsigned V0EltSize = V0EltVT.getSizeInBits();

  // This is only a truncation if the original element type is larger.
  if (V0EltSize <= EltSize)
    return SDValue();

  assert(((V0EltSize % EltSize) == 0) &&
         "Scalar type sizes must all be powers of 2 on x86!");

  const unsigned V0Opc = V0.getOpcode();
  const unsigned Scale = V0EltSize / EltSize;
  const unsigned V0BroadcastIdx = BroadcastIdx / Scale;

  if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
      V0Opc != ISD::BUILD_VECTOR)
    return SDValue();

  SDValue Scalar = V0.getOperand(V0BroadcastIdx);

  // If we're extracting non-least-significant bits, shift so we can truncate.
  // Hopefully, we can fold away the trunc/srl/load into the broadcast.
  // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
  // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
  if (const int OffsetIdx = BroadcastIdx % Scale)
    Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
                         DAG.getConstant(OffsetIdx * EltSize, DL, MVT::i8));

  return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
                     DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
}

/// Test whether this can be lowered with a single SHUFPS instruction.
///
/// This is used to disable more specialized lowerings when the shufps lowering
/// will happen to be efficient.
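/// For example, <0, 1, 4, 5> can be a single SHUFPS, while <0, 4, 1, 5>
/// cannot because its low half mixes elements from both inputs.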
static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
  // This routine only handles 128-bit shufps.
  assert(Mask.size() == 4 && "Unsupported mask size!");
  assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");

  // To lower with a single SHUFPS we need to have the low half and high half
  // each requiring a single input.
  if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
    return false;
  if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
    return false;

  return true;
}

/// Test whether the specified input (0 or 1) is in-place blended by the
/// given mask.
///
/// This returns true if the elements from a particular input are already in
/// the slots required by the given mask and need no permutation.
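/// For example, input 0 is in place for the mask <0, 1, 6, 7>: its elements
/// 0 and 1 already sit where the mask wants them.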
static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
  assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
      return false;

  return true;
}

/// If we are extracting two 128-bit halves of a vector and shuffling the
/// result, match that to a 256-bit AVX2 vperm* instruction to avoid a
/// multi-shuffle lowering.
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
                                             SDValue N1, ArrayRef<int> Mask,
                                             SelectionDAG &DAG) {
  MVT VT = N0.getSimpleValueType();
  assert((VT.is128BitVector() &&
          (VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
         "VPERM* family of shuffles requires 32-bit or 64-bit elements");

  // Check that both sources are extracts of the same source vector.
  if (N0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
      N1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
      N0.getOperand(0) != N1.getOperand(0) ||
      !N0.hasOneUse() || !N1.hasOneUse())
    return SDValue();

  SDValue WideVec = N0.getOperand(0);
  MVT WideVT = WideVec.getSimpleValueType();
  if (!WideVT.is256BitVector())
    return SDValue();

  // Match extracts of each half of the wide source vector. Commute the shuffle
  // if the extract of the low half is N1.
  unsigned NumElts = VT.getVectorNumElements();
  SmallVector<int, 4> NewMask(Mask);
  const APInt &ExtIndex0 = N0.getConstantOperandAPInt(1);
  const APInt &ExtIndex1 = N1.getConstantOperandAPInt(1);
  if (ExtIndex1 == 0 && ExtIndex0 == NumElts)
    ShuffleVectorSDNode::commuteMask(NewMask);
  else if (ExtIndex0 != 0 || ExtIndex1 != NumElts)
    return SDValue();

  // Final bailout: if the mask is simple, we are better off using an extract
  // and a simple narrow shuffle. Prefer extract+unpack(h/l)ps to vpermps
  // because that avoids a constant load from memory.
  if (NumElts == 4 &&
      (isSingleSHUFPSMask(NewMask) || is128BitUnpackShuffleMask(NewMask, DAG)))
    return SDValue();

  // Extend the shuffle mask with undef elements.
  NewMask.append(NumElts, -1);

  // shuf (extract X, 0), (extract X, 4), M --> extract (shuf X, undef, M'), 0
  SDValue Shuf = DAG.getVectorShuffle(WideVT, DL, WideVec, DAG.getUNDEF(WideVT),
                                      NewMask);
  // This is free: ymm -> xmm.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Shuf,
                     DAG.getIntPtrConstant(0, DL));
}

/// Try to lower broadcast of a single element.
///
/// For convenience, this code also bundles all of the subtarget feature set
/// filtering. While a little annoying to re-dispatch on type here, there isn't
/// a convenient way to factor it out.
static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
                                       SDValue V2, ArrayRef<int> Mask,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
        (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
        (Subtarget.hasAVX2() && VT.isInteger())))
    return SDValue();

  // With MOVDDUP (v2f64) we can broadcast from a register or a load; otherwise
  // we can only broadcast from a register with AVX2.
  unsigned NumEltBits = VT.getScalarSizeInBits();
  unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
                        ? X86ISD::MOVDDUP
                        : X86ISD::VBROADCAST;
  bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();

  // Check that the mask is a broadcast.
  int BroadcastIdx = getSplatIndex(Mask);
  if (BroadcastIdx < 0)
    return SDValue();
  assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
                                            "a sorted mask where the broadcast "
                                            "comes from V1.");

  // Go up the chain of (vector) values to find a scalar load that we can
  // combine with the broadcast.
  // TODO: Combine this logic with findEltLoadSrc() used by
  // EltsFromConsecutiveLoads().
  int BitOffset = BroadcastIdx * NumEltBits;
  SDValue V = V1;
  for (;;) {
    switch (V.getOpcode()) {
    case ISD::BITCAST: {
      V = V.getOperand(0);
      continue;
    }
    case ISD::CONCAT_VECTORS: {
      int OpBitWidth = V.getOperand(0).getValueSizeInBits();
      int OpIdx = BitOffset / OpBitWidth;
      V = V.getOperand(OpIdx);
      BitOffset %= OpBitWidth;
      continue;
    }
    case ISD::EXTRACT_SUBVECTOR: {
      // The extraction index adds to the existing offset.
      unsigned EltBitWidth = V.getScalarValueSizeInBits();
      unsigned Idx = V.getConstantOperandVal(1);
      unsigned BeginOffset = Idx * EltBitWidth;
      BitOffset += BeginOffset;
      V = V.getOperand(0);
      continue;
    }
    case ISD::INSERT_SUBVECTOR: {
      SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
      int EltBitWidth = VOuter.getScalarValueSizeInBits();
      int Idx = (int)V.getConstantOperandVal(2);
      int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
      int BeginOffset = Idx * EltBitWidth;
      int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
      if (BeginOffset <= BitOffset && BitOffset < EndOffset) {
        BitOffset -= BeginOffset;
        V = VInner;
      } else {
        V = VOuter;
      }
      continue;
    }
    }
    break;
  }
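  // For example, broadcasting element 5 of (concat_vectors X128, Y128) with
  // 32-bit elements walks into operand 1: BitOffset 160 becomes 32, i.e.
  // element 1 of Y128.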
  assert((BitOffset % NumEltBits) == 0 && "Illegal bit-offset");
  BroadcastIdx = BitOffset / NumEltBits;

  // Do we need to bitcast the source to retrieve the original broadcast index?
  bool BitCastSrc = V.getScalarValueSizeInBits() != NumEltBits;

  // Check if this is a broadcast of a scalar. We special case lowering
  // for scalars so that we can more effectively fold with loads.
  // If the original value has a larger element type than the shuffle, the
  // broadcast element is in essence truncated. Make that explicit to ease
  // folding.
  if (BitCastSrc && VT.isInteger())
    if (SDValue TruncBroadcast = lowerShuffleAsTruncBroadcast(
            DL, VT, V, BroadcastIdx, Subtarget, DAG))
      return TruncBroadcast;

  // Also check the simpler case, where we can directly reuse the scalar.
  if (!BitCastSrc &&
      ((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
       (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0))) {
    V = V.getOperand(BroadcastIdx);

    // If we can't broadcast from a register, check that the input is a load.
    if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
      return SDValue();
  } else if (ISD::isNormalLoad(V.getNode()) &&
             cast<LoadSDNode>(V)->isSimple()) {
    // We do not check for one-use of the vector load because a broadcast load
    // is expected to be a win for code size, register pressure, and possibly
    // uops even if the original vector load is not eliminated.

    // Reduce the vector load and shuffle to a broadcasted scalar load.
    LoadSDNode *Ld = cast<LoadSDNode>(V);
    SDValue BaseAddr = Ld->getOperand(1);
    MVT SVT = VT.getScalarType();
    unsigned Offset = BroadcastIdx * SVT.getStoreSize();
    assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
    SDValue NewAddr =
        DAG.getMemBasePlusOffset(BaseAddr, TypeSize::Fixed(Offset), DL);

    // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
    // than MOVDDUP.
    // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
    if (Opcode == X86ISD::VBROADCAST) {
      SDVTList Tys = DAG.getVTList(VT, MVT::Other);
      SDValue Ops[] = {Ld->getChain(), NewAddr};
      V = DAG.getMemIntrinsicNode(
          X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
          DAG.getMachineFunction().getMachineMemOperand(
              Ld->getMemOperand(), Offset, SVT.getStoreSize()));
      DAG.makeEquivalentMemoryOrdering(Ld, V);
      return DAG.getBitcast(VT, V);
    }
    assert(SVT == MVT::f64 && "Unexpected VT!");
    V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
                    DAG.getMachineFunction().getMachineMemOperand(
                        Ld->getMemOperand(), Offset, SVT.getStoreSize()));
    DAG.makeEquivalentMemoryOrdering(Ld, V);
  } else if (!BroadcastFromReg) {
    // We can't broadcast from a vector register.
    return SDValue();
  } else if (BitOffset != 0) {
    // We can only broadcast from the zero-element of a vector register,
    // but it can be advantageous to broadcast from the zero-element of a
    // subvector.
    if (!VT.is256BitVector() && !VT.is512BitVector())
      return SDValue();

    // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
    if (VT == MVT::v4f64 || VT == MVT::v4i64)
      return SDValue();

    // Only broadcast the zero-element of a 128-bit subvector.
    if ((BitOffset % 128) != 0)
      return SDValue();

    assert((BitOffset % V.getScalarValueSizeInBits()) == 0 &&
           "Unexpected bit-offset");
    assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
           "Unexpected vector size");
    unsigned ExtractIdx = BitOffset / V.getScalarValueSizeInBits();
    V = extract128BitVector(V, ExtractIdx, DAG, DL);
  }

  // On AVX we can use VBROADCAST directly for scalar sources.
  if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) {
    V = DAG.getBitcast(MVT::f64, V);
    if (Subtarget.hasAVX()) {
      V = DAG.getNode(X86ISD::VBROADCAST, DL, MVT::v2f64, V);
      return DAG.getBitcast(VT, V);
    }
    V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V);
  }

  // If this is a scalar, do the broadcast on this type and bitcast.
  if (!V.getValueType().isVector()) {
    assert(V.getScalarValueSizeInBits() == NumEltBits &&
           "Unexpected scalar size");
    MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
                                       VT.getVectorNumElements());
    return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
  }

  // We only support broadcasting from 128-bit vectors to minimize the
  // number of patterns we need to deal with in isel. So extract down to
  // 128 bits, removing as many bitcasts as possible.
  if (V.getValueSizeInBits() > 128)
    V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);

  // Otherwise cast V to a vector with the same element type as VT, but
  // possibly narrower than VT. Then perform the broadcast.
  unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
  MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
  return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
}

// Check for whether we can use INSERTPS to perform the shuffle. We only use
// INSERTPS when the V1 elements are already in the correct locations
// because otherwise we can always use two SHUFPS instructions, which are
// much smaller to encode than a SHUFPS and an INSERTPS. We can also
// perform INSERTPS if a single V1 element is out of place and all V2
// elements are zeroable.
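// For example, the v4f32 mask <0, 1, 6, 3> yields INSERTPS immediate 0xA0:
// bits 7:6 select element 2 of V2, bits 5:4 place it in lane 2, and the zero
// mask in bits 3:0 is empty.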
static bool matchShuffleAsInsertPS(SDValue &V1, SDValue &V2,
                                   unsigned &InsertPSMask,
                                   const APInt &Zeroable,
                                   ArrayRef<int> Mask, SelectionDAG &DAG) {
  assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
  assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");

  // Attempt to match INSERTPS with one element from VA or VB being
  // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
  // are updated.
  auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
                             ArrayRef<int> CandidateMask) {
    unsigned ZMask = 0;
    int VADstIndex = -1;
    int VBDstIndex = -1;
    bool VAUsedInPlace = false;

    for (int i = 0; i < 4; ++i) {
      // Synthesize a zero mask from the zeroable elements (includes undefs).
      if (Zeroable[i]) {
        ZMask |= 1 << i;
        continue;
      }

      // Flag if we use any VA inputs in place.
      if (i == CandidateMask[i]) {
        VAUsedInPlace = true;
        continue;
      }

      // We can only insert a single non-zeroable element.
      if (VADstIndex >= 0 || VBDstIndex >= 0)
        return false;

      if (CandidateMask[i] < 4) {
        // VA input out of place for insertion.
        VADstIndex = i;
      } else {
        // VB input for insertion.
        VBDstIndex = i;
      }
    }

    // Don't bother if we have no (non-zeroable) element for insertion.
    if (VADstIndex < 0 && VBDstIndex < 0)
      return false;

    // Determine element insertion src/dst indices. The src index is from the
    // start of the inserted vector, not the start of the concatenated vector.
    unsigned VBSrcIndex = 0;
    if (VADstIndex >= 0) {
      // If we have a VA input out of place, we use VA as the V2 element
      // insertion and don't use the original V2 at all.
      VBSrcIndex = CandidateMask[VADstIndex];
      VBDstIndex = VADstIndex;
      VB = VA;
    } else {
      VBSrcIndex = CandidateMask[VBDstIndex] - 4;
    }

    // If no V1 inputs are used in place, then the result is created only from
    // the zero mask and the V2 insertion, so remove the V1 dependency.
    if (!VAUsedInPlace)
      VA = DAG.getUNDEF(MVT::v4f32);

    // Update V1, V2 and InsertPSMask accordingly.
    V1 = VA;
    V2 = VB;

    // Insert the V2 element into the desired position.
    InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
    assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
    return true;
  };

  if (matchAsInsertPS(V1, V2, Mask))
    return true;

  // Commute and try again.
  SmallVector<int, 4> CommutedMask(Mask);
  ShuffleVectorSDNode::commuteMask(CommutedMask);
  if (matchAsInsertPS(V2, V1, CommutedMask))
    return true;

  return false;
}

static SDValue lowerShuffleAsInsertPS(const SDLoc &DL, SDValue V1, SDValue V2,
                                      ArrayRef<int> Mask, const APInt &Zeroable,
                                      SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");

  // Attempt to match the insertps pattern.
  unsigned InsertPSMask = 0;
  if (!matchShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
    return SDValue();

  // Insert the V2 element into the desired position.
  return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
                     DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
}

/// Handle lowering of 2-lane 64-bit floating-point shuffles.
///
/// This is the basis function for the 2-lane 64-bit shuffles as we have full
/// support for floating-point shuffles but not integer shuffles. These
/// instructions will incur a domain crossing penalty on some chips though,
/// so it is better to avoid lowering through this for integer vectors where
/// possible.
static SDValue lowerV2F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
  assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
| 15270 | ||||
| 15271 | if (V2.isUndef()) { | |||
| 15272 | // Check for being able to broadcast a single element. | |||
| 15273 | if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2f64, V1, V2, | |||
| 15274 | Mask, Subtarget, DAG)) | |||
| 15275 | return Broadcast; | |||
| 15276 | ||||
| 15277 | // Straight shuffle of a single input vector. Simulate this by using the | |||
| 15278 | // single input as both of the "inputs" to this instruction. | |||
| 15279 | unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); | |||
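| | // E.g. a unary Mask of {1, 0} yields the immediate 0b01: lane 0 takes V1[1] | |||
| | // and lane 1 takes V1[0], swapping the two doubles. | |||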
| 15280 | ||||
| 15281 | if (Subtarget.hasAVX()) { | |||
| 15282 | // If we have AVX, we can use VPERMILPD which will allow folding a load | |||
| 15283 | // into the shuffle. | |||
| 15284 | return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1, | |||
| 15285 | DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); | |||
| 15286 | } | |||
| 15287 | ||||
| 15288 | return DAG.getNode( | |||
| 15289 | X86ISD::SHUFP, DL, MVT::v2f64, | |||
| 15290 | Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, | |||
| 15291 | Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1, | |||
| 15292 | DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); | |||
| 15293 | } | |||
| 15294 | assert(Mask[0] >= 0 && "No undef lanes in multi-input v2 shuffles!"); | |||
| 15295 | assert(Mask[1] >= 0 && "No undef lanes in multi-input v2 shuffles!"); | |||
| 15296 | assert(Mask[0] < 2 && "We sort V1 to be the first input."); | |||
| 15297 | assert(Mask[1] >= 2 && "We sort V2 to be the second input."); | |||
| 15298 | ||||
| 15299 | if (Subtarget.hasAVX2()) | |||
| 15300 | if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) | |||
| 15301 | return Extract; | |||
| 15302 | ||||
| 15303 | // When loading a scalar and then shuffling it into a vector we can often do | |||
| 15304 | // the insertion cheaply. | |||
| 15305 | if (SDValue Insertion = lowerShuffleAsElementInsertion( | |||
| 15306 | DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG)) | |||
| 15307 | return Insertion; | |||
| 15308 | // Try inverting the insertion since for v2 masks it is easy to do and we | |||
| 15309 | // can't reliably sort the mask one way or the other. | |||
| 15310 | int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2), | |||
| 15311 | Mask[1] < 0 ? -1 : (Mask[1] ^ 2)}; | |||
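| | // XOR with 2 retargets each index at the other operand: e.g. Mask {0, 3} | |||
| | // becomes {2, 1}, which selects the same data once V1 and V2 are swapped. | |||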
| 15312 | if (SDValue Insertion = lowerShuffleAsElementInsertion( | |||
| 15313 | DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) | |||
| 15314 | return Insertion; | |||
| 15315 | ||||
| 15316 | // Try to use one of the special instruction patterns to handle two common | |||
| 15317 | // blend patterns if a zero-blend above didn't work. | |||
| 15318 | if (isShuffleEquivalent(Mask, {0, 3}, V1, V2) || | |||
| 15319 | isShuffleEquivalent(Mask, {1, 3}, V1, V2)) | |||
| 15320 | if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG)) | |||
| 15321 | // We can either use a special instruction to load over the low double or | |||
| 15322 | // to move just the low double. | |||
| 15323 | return DAG.getNode( | |||
| 15324 | X86ISD::MOVSD, DL, MVT::v2f64, V2, | |||
| 15325 | DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S)); | |||
| 15326 | ||||
| 15327 | if (Subtarget.hasSSE41()) | |||
| 15328 | if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask, | |||
| 15329 | Zeroable, Subtarget, DAG)) | |||
| 15330 | return Blend; | |||
| 15331 | ||||
| 15332 | // Use dedicated unpack instructions for masks that match their pattern. | |||
| 15333 | if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG)) | |||
| 15334 | return V; | |||
| 15335 | ||||
| 15336 | unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); | |||
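| | // E.g. Mask {1, 2} encodes to 0b01: lane 0 takes V1[1], lane 1 takes V2[0]. | |||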
| 15337 | return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2, | |||
| 15338 | DAG.getTargetConstant(SHUFPDMask, DL, MVT::i8)); | |||
| 15339 | } | |||
| 15340 | ||||
| 15341 | /// Handle lowering of 2-lane 64-bit integer shuffles. | |||
| 15342 | /// | |||
| 15343 | /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by | |||
| 15344 | /// the integer unit to minimize domain crossing penalties. However, for blends | |||
| 15345 | /// it falls back to the floating point shuffle operation with appropriate bit | |||
| 15346 | /// casting. | |||
| 15347 | static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 15348 | const APInt &Zeroable, SDValue V1, SDValue V2, | |||
| 15349 | const X86Subtarget &Subtarget, | |||
| 15350 | SelectionDAG &DAG) { | |||
| 15351 | assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); | |||
| 15352 | assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); | |||
| 15353 | assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); | |||
| 15354 | ||||
| 15355 | if (V2.isUndef()) { | |||
| 15356 | // Check for being able to broadcast a single element. | |||
| 15357 | if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v2i64, V1, V2, | |||
| 15358 | Mask, Subtarget, DAG)) | |||
| 15359 | return Broadcast; | |||
| 15360 | ||||
| 15361 | // Straight shuffle of a single input vector. For everything from SSE2 | |||
| 15362 | // onward this has a single fast instruction with no scary immediates. | |||
| 15363 | // We have to map the mask as it is actually a v4i32 shuffle instruction. | |||
| 15364 | V1 = DAG.getBitcast(MVT::v4i32, V1); | |||
| 15365 | int WidenedMask[4] = {Mask[0] < 0 ? -1 : (Mask[0] * 2), | |||
| 15366 | Mask[0] < 0 ? -1 : ((Mask[0] * 2) + 1), | |||
| 15367 | Mask[1] < 0 ? -1 : (Mask[1] * 2), | |||
| 15368 | Mask[1] < 0 ? -1 : ((Mask[1] * 2) + 1)}; | |||
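| | // E.g. a v2i64 Mask of {1, 0} widens to the v4i32 mask {2, 3, 0, 1} | |||
| | // (PSHUFD immediate 0x4E), swapping the two 64-bit halves. | |||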
| 15369 | return DAG.getBitcast( | |||
| 15370 | MVT::v2i64, | |||
| 15371 | DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, | |||
| 15372 | getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG))); | |||
| 15373 | } | |||
| 15374 | assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!"); | |||
| 15375 | assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!"); | |||
| 15376 | assert(Mask[0] < 2 && "We sort V1 to be the first input."); | |||
| 15377 | assert(Mask[1] >= 2 && "We sort V2 to be the second input."); | |||
| 15378 | ||||
| 15379 | if (Subtarget.hasAVX2()) | |||
| 15380 | if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) | |||
| 15381 | return Extract; | |||
| 15382 | ||||
| 15383 | // Try to use shift instructions. | |||
| 15384 | if (SDValue Shift = | |||
| 15385 | lowerShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, | |||
| 15386 | DAG, /*BitwiseOnly*/ false)) | |||
| 15387 | return Shift; | |||
| 15388 | ||||
| 15389 | // When loading a scalar and then shuffling it into a vector we can often do | |||
| 15390 | // the insertion cheaply. | |||
| 15391 | if (SDValue Insertion = lowerShuffleAsElementInsertion( | |||
| 15392 | DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) | |||
| 15393 | return Insertion; | |||
| 15394 | // Try inverting the insertion since for v2 masks it is easy to do and we | |||
| 15395 | // can't reliably sort the mask one way or the other. | |||
| 15396 | int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2}; | |||
| 15397 | if (SDValue Insertion = lowerShuffleAsElementInsertion( | |||
| 15398 | DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG)) | |||
| 15399 | return Insertion; | |||
| 15400 | ||||
| 15401 | // We have different paths for blend lowering, but they all must use the | |||
| 15402 | // *exact* same predicate. | |||
| 15403 | bool IsBlendSupported = Subtarget.hasSSE41(); | |||
| 15404 | if (IsBlendSupported) | |||
| 15405 | if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask, | |||
| 15406 | Zeroable, Subtarget, DAG)) | |||
| 15407 | return Blend; | |||
| 15408 | ||||
| 15409 | // Use dedicated unpack instructions for masks that match their pattern. | |||
| 15410 | if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG)) | |||
| 15411 | return V; | |||
| 15412 | ||||
| 15413 | // Try to use byte rotation instructions. | |||
| 15414 | // It's more profitable for pre-SSSE3 to use shuffles/unpacks. | |||
| 15415 | if (Subtarget.hasSSSE3()) { | |||
| 15416 | if (Subtarget.hasVLX()) | |||
| 15417 | if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask, | |||
| 15418 | Subtarget, DAG)) | |||
| 15419 | return Rotate; | |||
| 15420 | ||||
| 15421 | if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v2i64, V1, V2, Mask, | |||
| 15422 | Subtarget, DAG)) | |||
| 15423 | return Rotate; | |||
| 15424 | } | |||
| 15425 | ||||
| 15426 | // If we have direct support for blends, we should lower by decomposing into | |||
| 15427 | // a permute. That will be faster than the domain cross. | |||
| 15428 | if (IsBlendSupported) | |||
| 15429 | return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v2i64, V1, V2, Mask, | |||
| 15430 | Subtarget, DAG); | |||
| 15431 | ||||
| 15432 | // We implement this with SHUFPD which is pretty lame because it will likely | |||
| 15433 | // incur 2 cycles of stall for integer vectors on Nehalem and older chips. | |||
| 15434 | // However, all the alternatives are still more cycles and newer chips don't | |||
| 15435 | // have this problem. It would be really nice if x86 had better shuffles here. | |||
| 15436 | V1 = DAG.getBitcast(MVT::v2f64, V1); | |||
| 15437 | V2 = DAG.getBitcast(MVT::v2f64, V2); | |||
| 15438 | return DAG.getBitcast(MVT::v2i64, | |||
| 15439 | DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); | |||
| 15440 | } | |||
| 15441 | ||||
| 15442 | /// Lower a vector shuffle using the SHUFPS instruction. | |||
| 15443 | /// | |||
| 15444 | /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS. | |||
| 15445 | /// It makes no assumptions about whether this is the *best* lowering; it simply | |||
| 15446 | /// uses it. | |||
| 15447 | static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT, | |||
| 15448 | ArrayRef<int> Mask, SDValue V1, | |||
| 15449 | SDValue V2, SelectionDAG &DAG) { | |||
| 15450 | SDValue LowV = V1, HighV = V2; | |||
| 15451 | SmallVector<int, 4> NewMask(Mask); | |||
| 15452 | int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); | |||
| 15453 | ||||
| 15454 | if (NumV2Elements == 1) { | |||
| 15455 | int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin(); | |||
| 15456 | ||||
| 15457 | // Compute the index adjacent to V2Index and in the same half by toggling | |||
| 15458 | // the low bit. | |||
| 15459 | int V2AdjIndex = V2Index ^ 1; | |||
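| | // E.g. V2Index 2 pairs with index 3, and V2Index 1 pairs with index 0. | |||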
| 15460 | ||||
| 15461 | if (Mask[V2AdjIndex] < 0) { | |||
| 15462 | // Handles all the cases where we have a single V2 element and an undef. | |||
| 15463 | // This will only ever happen in the high lanes because we commute the | |||
| 15464 | // vector otherwise. | |||
| 15465 | if (V2Index < 2) | |||
| 15466 | std::swap(LowV, HighV); | |||
| 15467 | NewMask[V2Index] -= 4; | |||
| 15468 | } else { | |||
| 15469 | // Handle the case where the V2 element ends up adjacent to a V1 element. | |||
| 15470 | // To make this work, blend them together as the first step. | |||
| 15471 | int V1Index = V2AdjIndex; | |||
| 15472 | int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; | |||
| 15473 | V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1, | |||
| 15474 | getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); | |||
| 15475 | ||||
| 15476 | // Now proceed to reconstruct the final blend as we have the necessary | |||
| 15477 | // high or low half formed. | |||
| 15478 | if (V2Index < 2) { | |||
| 15479 | LowV = V2; | |||
| 15480 | HighV = V1; | |||
| 15481 | } else { | |||
| 15482 | HighV = V2; | |||
| 15483 | } | |||
| 15484 | NewMask[V1Index] = 2; // We put the V1 element in V2[2]. | |||
| 15485 | NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. | |||
| 15486 | } | |||
| 15487 | } else if (NumV2Elements == 2) { | |||
| 15488 | if (Mask[0] < 4 && Mask[1] < 4) { | |||
| 15489 | // Handle the easy case where we have V1 in the low lanes and V2 in the | |||
| 15490 | // high lanes. | |||
| 15491 | NewMask[2] -= 4; | |||
| 15492 | NewMask[3] -= 4; | |||
| 15493 | } else if (Mask[2] < 4 && Mask[3] < 4) { | |||
| 15494 | // We also handle the reversed case because this utility may get called | |||
| 15495 | // when we detect a SHUFPS pattern but can't easily commute the shuffle to | |||
| 15496 | // arrange things in the right direction. | |||
| 15497 | NewMask[0] -= 4; | |||
| 15498 | NewMask[1] -= 4; | |||
| 15499 | HighV = V1; | |||
| 15500 | LowV = V2; | |||
| 15501 | } else { | |||
| 15502 | // We have a mixture of V1 and V2 in both low and high lanes. Rather than | |||
| 15503 | // trying to place elements directly, just blend them and set up the final | |||
| 15504 | // shuffle to place them. | |||
| 15505 | ||||
| 15506 | // The first two blend mask elements are for V1, the second two are for | |||
| 15507 | // V2. | |||
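| | // For example, Mask {0, 5, 2, 7} blends with BlendMask {0, 2, 1, 3} to form | |||
| | // [V1[0], V1[2], V2[1], V2[3]], and the final shuffle with NewMask | |||
| | // {0, 2, 1, 3} rearranges that into [V1[0], V2[1], V1[2], V2[3]]. | |||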
| 15508 | int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], | |||
| 15509 | Mask[2] < 4 ? Mask[2] : Mask[3], | |||
| 15510 | (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, | |||
| 15511 | (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; | |||
| 15512 | V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, | |||
| 15513 | getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG)); | |||
| 15514 | ||||
| 15515 | // Now we do a normal shuffle of V1 by giving V1 as both operands to | |||
| 15516 | // a blend. | |||
| 15517 | LowV = HighV = V1; | |||
| 15518 | NewMask[0] = Mask[0] < 4 ? 0 : 2; | |||
| 15519 | NewMask[1] = Mask[0] < 4 ? 2 : 0; | |||
| 15520 | NewMask[2] = Mask[2] < 4 ? 1 : 3; | |||
| 15521 | NewMask[3] = Mask[2] < 4 ? 3 : 1; | |||
| 15522 | } | |||
| 15523 | } else if (NumV2Elements == 3) { | |||
| 15524 | // Ideally canonicalizeShuffleMaskWithCommute should have caught this, but | |||
| 15525 | // we can get here via other paths (e.g. repeated mask matching) where we | |||
| 15526 | // don't want to do another round of lowerVECTOR_SHUFFLE. | |||
| 15527 | ShuffleVectorSDNode::commuteMask(NewMask); | |||
| 15528 | return lowerShuffleWithSHUFPS(DL, VT, NewMask, V2, V1, DAG); | |||
| 15529 | } | |||
| 15530 | return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV, | |||
| 15531 | getV4X86ShuffleImm8ForMask(NewMask, DL, DAG)); | |||
| 15532 | } | |||
| 15533 | ||||
| 15534 | /// Lower 4-lane 32-bit floating point shuffles. | |||
| 15535 | /// | |||
| 15536 | /// Uses instructions exclusively from the floating point unit to minimize | |||
| 15537 | /// domain crossing penalties, as these are sufficient to implement all v4f32 | |||
| 15538 | /// shuffles. | |||
| 15539 | static SDValue lowerV4F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 15540 | const APInt &Zeroable, SDValue V1, SDValue V2, | |||
| 15541 | const X86Subtarget &Subtarget, | |||
| 15542 | SelectionDAG &DAG) { | |||
| 15543 | assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); | |||
| 15544 | assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); | |||
| 15545 | assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); | |||
| 15546 | ||||
| 15547 | if (Subtarget.hasSSE41()) | |||
| 15548 | if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask, | |||
| 15549 | Zeroable, Subtarget, DAG)) | |||
| 15550 | return Blend; | |||
| 15551 | ||||
| 15552 | int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); | |||
| 15553 | ||||
| 15554 | if (NumV2Elements == 0) { | |||
| 15555 | // Check for being able to broadcast a single element. | |||
| 15556 | if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f32, V1, V2, | |||
| 15557 | Mask, Subtarget, DAG)) | |||
| 15558 | return Broadcast; | |||
| 15559 | ||||
| 15560 | // Use even/odd duplicate instructions for masks that match their pattern. | |||
| 15561 | if (Subtarget.hasSSE3()) { | |||
| 15562 | if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2)) | |||
| 15563 | return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1); | |||
| 15564 | if (isShuffleEquivalent(Mask, {1, 1, 3, 3}, V1, V2)) | |||
| 15565 | return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1); | |||
| 15566 | } | |||
| 15567 | ||||
| 15568 | if (Subtarget.hasAVX()) { | |||
| 15569 | // If we have AVX, we can use VPERMILPS which will allow folding a load | |||
| 15570 | // into the shuffle. | |||
| 15571 | return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1, | |||
| 15572 | getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); | |||
| 15573 | } | |||
| 15574 | ||||
| 15575 | // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid | |||
| 15576 | // in SSE1 because otherwise they are widened to v2f64 and never get here. | |||
| 15577 | if (!Subtarget.hasSSE2()) { | |||
| 15578 | if (isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2)) | |||
| 15579 | return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1); | |||
| 15580 | if (isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1, V2)) | |||
| 15581 | return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1); | |||
| 15582 | } | |||
| 15583 | ||||
| 15584 | // Otherwise, use a straight shuffle of a single input vector. We pass the | |||
| 15585 | // input vector to both operands to simulate this with a SHUFPS. | |||
| 15586 | return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, | |||
| 15587 | getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); | |||
| 15588 | } | |||
| 15589 | ||||
| 15590 | if (Subtarget.hasSSE2()) | |||
| 15591 | if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( | |||
| 15592 | DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) { | |||
| 15593 | ZExt = DAG.getBitcast(MVT::v4f32, ZExt); | |||
| 15594 | return ZExt; | |||
| 15595 | } | |||
| 15596 | ||||
| 15597 | if (Subtarget.hasAVX2()) | |||
| 15598 | if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) | |||
| 15599 | return Extract; | |||
| 15600 | ||||
| 15601 | // There are special ways we can lower some single-element blends. However, we | |||
| 15602 | // have custom ways we can lower more complex single-element blends below that | |||
| 15603 | // we defer to if both this and BLENDPS fail to match, so restrict this to | |||
| 15604 | // when the V2 input is targeting element 0 of the mask -- that is the fast | |||
| 15605 | // case here. | |||
| 15606 | if (NumV2Elements == 1 && Mask[0] >= 4) | |||
| 15607 | if (SDValue V = lowerShuffleAsElementInsertion( | |||
| 15608 | DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG)) | |||
| 15609 | return V; | |||
| 15610 | ||||
| 15611 | if (Subtarget.hasSSE41()) { | |||
| 15612 | // Use INSERTPS if we can complete the shuffle efficiently. | |||
| 15613 | if (SDValue V = lowerShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG)) | |||
| 15614 | return V; | |||
| 15615 | ||||
| 15616 | if (!isSingleSHUFPSMask(Mask)) | |||
| 15617 | if (SDValue BlendPerm = lowerShuffleAsBlendAndPermute(DL, MVT::v4f32, V1, | |||
| 15618 | V2, Mask, DAG)) | |||
| 15619 | return BlendPerm; | |||
| 15620 | } | |||
| 15621 | ||||
| 15622 | // Use low/high mov instructions. These are only valid in SSE1 because | |||
| 15623 | // otherwise they are widened to v2f64 and never get here. | |||
| 15624 | if (!Subtarget.hasSSE2()) { | |||
| 15625 | if (isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) | |||
| 15626 | return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2); | |||
| 15627 | if (isShuffleEquivalent(Mask, {2, 3, 6, 7}, V1, V2)) | |||
| 15628 | return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1); | |||
| 15629 | } | |||
| 15630 | ||||
| 15631 | // Use dedicated unpack instructions for masks that match their pattern. | |||
| 15632 | if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG)) | |||
| 15633 | return V; | |||
| 15634 | ||||
| 15635 | // Otherwise fall back to a SHUFPS lowering strategy. | |||
| 15636 | return lowerShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG); | |||
| 15637 | } | |||
| 15638 | ||||
| 15639 | /// Lower 4-lane i32 vector shuffles. | |||
| 15640 | /// | |||
| 15641 | /// We try to handle these with integer-domain shuffles where we can, but for | |||
| 15642 | /// blends we use the floating point domain blend instructions. | |||
| 15643 | static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 15644 | const APInt &Zeroable, SDValue V1, SDValue V2, | |||
| 15645 | const X86Subtarget &Subtarget, | |||
| 15646 | SelectionDAG &DAG) { | |||
| 15647 | assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); | |||
| 15648 | assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); | |||
| 15649 | assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); | |||
| 15650 | ||||
| 15651 | // Whenever we can lower this as a zext, that instruction is strictly faster | |||
| 15652 | // than any alternative. It also allows us to fold memory operands into the | |||
| 15653 | // shuffle in many cases. | |||
| 15654 | if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v4i32, V1, V2, Mask, | |||
| 15655 | Zeroable, Subtarget, DAG)) | |||
| 15656 | return ZExt; | |||
| 15657 | ||||
| 15658 | int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; }); | |||
| 15659 | ||||
| 15660 | // Try to use shift instructions if fast. | |||
| 15661 | if (Subtarget.preferLowerShuffleAsShift()) { | |||
| 15662 | if (SDValue Shift = | |||
| 15663 | lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, | |||
| 15664 | Subtarget, DAG, /*BitwiseOnly*/ true)) | |||
| 15665 | return Shift; | |||
| 15666 | if (NumV2Elements == 0) | |||
| 15667 | if (SDValue Rotate = | |||
| 15668 | lowerShuffleAsBitRotate(DL, MVT::v4i32, V1, Mask, Subtarget, DAG)) | |||
| 15669 | return Rotate; | |||
| 15670 | } | |||
| 15671 | ||||
| 15672 | if (NumV2Elements == 0) { | |||
| 15673 | // Try to use broadcast unless the mask only has one non-undef element. | |||
| 15674 | if (count_if(Mask, [](int M) { return M >= 0 && M < 4; }) > 1) { | |||
| 15675 | if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i32, V1, V2, | |||
| 15676 | Mask, Subtarget, DAG)) | |||
| 15677 | return Broadcast; | |||
| 15678 | } | |||
| 15679 | ||||
| 15680 | // Straight shuffle of a single input vector. For everything from SSE2 | |||
| 15681 | // onward this has a single fast instruction with no scary immediates. | |||
| 15682 | // We coerce the shuffle pattern to be compatible with UNPCK instructions | |||
| 15683 | // but we aren't actually going to use the UNPCK instruction because doing | |||
| 15684 | // so prevents folding a load into this instruction or making a copy. | |||
| 15685 | const int UnpackLoMask[] = {0, 0, 1, 1}; | |||
| 15686 | const int UnpackHiMask[] = {2, 2, 3, 3}; | |||
| 15687 | if (isShuffleEquivalent(Mask, {0, 0, 1, 1}, V1, V2)) | |||
| 15688 | Mask = UnpackLoMask; | |||
| 15689 | else if (isShuffleEquivalent(Mask, {2, 2, 3, 3}, V1, V2)) | |||
| 15690 | Mask = UnpackHiMask; | |||
| 15691 | ||||
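| | // E.g. the unpack-lo pattern {0, 0, 1, 1} encodes to the PSHUFD immediate | |||
| | // 0x50 and the unpack-hi pattern {2, 2, 3, 3} to 0xFA. | |||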
| 15692 | return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, | |||
| 15693 | getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); | |||
| 15694 | } | |||
| 15695 | ||||
| 15696 | if (Subtarget.hasAVX2()) | |||
| 15697 | if (SDValue Extract = lowerShuffleOfExtractsAsVperm(DL, V1, V2, Mask, DAG)) | |||
| 15698 | return Extract; | |||
| 15699 | ||||
| 15700 | // Try to use shift instructions. | |||
| 15701 | if (SDValue Shift = | |||
| 15702 | lowerShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, | |||
| 15703 | DAG, /*BitwiseOnly*/ false)) | |||
| 15704 | return Shift; | |||
| 15705 | ||||
| 15706 | // There are special ways we can lower some single-element blends. | |||
| 15707 | if (NumV2Elements == 1) | |||
| 15708 | if (SDValue V = lowerShuffleAsElementInsertion( | |||
| 15709 | DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) | |||
| 15710 | return V; | |||
| 15711 | ||||
| 15712 | // We have different paths for blend lowering, but they all must use the | |||
| 15713 | // *exact* same predicate. | |||
| 15714 | bool IsBlendSupported = Subtarget.hasSSE41(); | |||
| 15715 | if (IsBlendSupported) | |||
| 15716 | if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask, | |||
| 15717 | Zeroable, Subtarget, DAG)) | |||
| 15718 | return Blend; | |||
| 15719 | ||||
| 15720 | if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask, | |||
| 15721 | Zeroable, Subtarget, DAG)) | |||
| 15722 | return Masked; | |||
| 15723 | ||||
| 15724 | // Use dedicated unpack instructions for masks that match their pattern. | |||
| 15725 | if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG)) | |||
| 15726 | return V; | |||
| 15727 | ||||
| 15728 | // Try to use byte rotation instructions. | |||
| 15729 | // It's more profitable for pre-SSSE3 to use shuffles/unpacks. | |||
| 15730 | if (Subtarget.hasSSSE3()) { | |||
| 15731 | if (Subtarget.hasVLX()) | |||
| 15732 | if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask, | |||
| 15733 | Subtarget, DAG)) | |||
| 15734 | return Rotate; | |||
| 15735 | ||||
| 15736 | if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i32, V1, V2, Mask, | |||
| 15737 | Subtarget, DAG)) | |||
| 15738 | return Rotate; | |||
| 15739 | } | |||
| 15740 | ||||
| 15741 | // Assume that a single SHUFPS is faster than an alternative sequence of | |||
| 15742 | // multiple instructions (even if the CPU has a domain penalty). | |||
| 15743 | // If some CPU is harmed by the domain switch, we can fix it in a later pass. | |||
| 15744 | if (!isSingleSHUFPSMask(Mask)) { | |||
| 15745 | // If we have direct support for blends, we should lower by decomposing into | |||
| 15746 | // a permute. That will be faster than the domain cross. | |||
| 15747 | if (IsBlendSupported) | |||
| 15748 | return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i32, V1, V2, Mask, | |||
| 15749 | Subtarget, DAG); | |||
| 15750 | ||||
| 15751 | // Try to lower by permuting the inputs into an unpack instruction. | |||
| 15752 | if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v4i32, V1, V2, | |||
| 15753 | Mask, Subtarget, DAG)) | |||
| 15754 | return Unpack; | |||
| 15755 | } | |||
| 15756 | ||||
| 15757 | // We implement this with SHUFPS because it can blend from two vectors. | |||
| 15758 | // Because we're going to eventually use SHUFPS, we use SHUFPS even to build | |||
| 15759 | // up the inputs, bypassing domain shift penalties that we would incur if we | |||
| 15760 | // directly used PSHUFD on Nehalem and older. For newer chips, this isn't | |||
| 15761 | // relevant. | |||
| 15762 | SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1); | |||
| 15763 | SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2); | |||
| 15764 | SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask); | |||
| 15765 | return DAG.getBitcast(MVT::v4i32, ShufPS); | |||
| 15766 | } | |||
| 15767 | ||||
| 15768 | /// Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 | |||
| 15769 | /// shuffle lowering, and the most complex part. | |||
| 15770 | /// | |||
| 15771 | /// The lowering strategy is to try to form pairs of input lanes which are | |||
| 15772 | /// targeted at the same half of the final vector, and then use a dword shuffle | |||
| 15773 | /// to place them onto the right half, and finally unpack the paired lanes into | |||
| 15774 | /// their final position. | |||
| 15775 | /// | |||
| 15776 | /// The exact breakdown of how to form these dword pairs and align them on the | |||
| 15777 | /// correct sides is really tricky. See the comments within the function for | |||
| 15778 | /// more of the details. | |||
| 15779 | /// | |||
| 15780 | /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each | |||
| 15781 | /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to | |||
| 15782 | /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16 | |||
| 15783 | /// vector, form the analogous 128-bit 8-element Mask. | |||
| 15784 | static SDValue lowerV8I16GeneralSingleInputShuffle( | |||
| 15785 | const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask, | |||
| 15786 | const X86Subtarget &Subtarget, SelectionDAG &DAG) { | |||
| 15787 | assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!"); | |||
| 15788 | MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); | |||
| 15789 | ||||
| 15790 | assert(Mask.size() == 8 && "Shuffle mask length doesn't match!"); | |||
| 15791 | MutableArrayRef<int> LoMask = Mask.slice(0, 4); | |||
| 15792 | MutableArrayRef<int> HiMask = Mask.slice(4, 4); | |||
| 15793 | ||||
| 15794 | // Attempt to directly match PSHUFLW or PSHUFHW. | |||
| 15795 | if (isUndefOrInRange(LoMask, 0, 4) && | |||
| 15796 | isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { | |||
| 15797 | return DAG.getNode(X86ISD::PSHUFLW, DL, VT, V, | |||
| 15798 | getV4X86ShuffleImm8ForMask(LoMask, DL, DAG)); | |||
| 15799 | } | |||
| 15800 | if (isUndefOrInRange(HiMask, 4, 8) && | |||
| 15801 | isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { | |||
| 15802 | for (int i = 0; i != 4; ++i) | |||
| 15803 | HiMask[i] = (HiMask[i] < 0 ? HiMask[i] : (HiMask[i] - 4)); | |||
| 15804 | return DAG.getNode(X86ISD::PSHUFHW, DL, VT, V, | |||
| 15805 | getV4X86ShuffleImm8ForMask(HiMask, DL, DAG)); | |||
| 15806 | } | |||
| 15807 | ||||
| 15808 | SmallVector<int, 4> LoInputs; | |||
| 15809 | copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; }); | |||
| 15810 | array_pod_sort(LoInputs.begin(), LoInputs.end()); | |||
| 15811 | LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); | |||
| 15812 | SmallVector<int, 4> HiInputs; | |||
| 15813 | copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; }); | |||
| 15814 | array_pod_sort(HiInputs.begin(), HiInputs.end()); | |||
| 15815 | HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); | |||
| 15816 | int NumLToL = llvm::lower_bound(LoInputs, 4) - LoInputs.begin(); | |||
| 15817 | int NumHToL = LoInputs.size() - NumLToL; | |||
| 15818 | int NumLToH = llvm::lower_bound(HiInputs, 4) - HiInputs.begin(); | |||
| 15819 | int NumHToH = HiInputs.size() - NumLToH; | |||
| 15820 | MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL); | |||
| 15821 | MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH); | |||
| 15822 | MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL); | |||
| 15823 | MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH); | |||
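| | // The inputs are now partitioned into four quadrants: LToL/HToL are the | |||
| | // low/high-half source words feeding the low half of the mask, and | |||
| | // LToH/HToH feed the high half. E.g. for Mask [0, 5, 1, 7, 4, 2, 6, 3] we | |||
| | // get LToL = {0, 1}, HToL = {5, 7}, LToH = {2, 3} and HToH = {4, 6}. | |||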
| 15824 | ||||
| 15825 | // If we are shuffling values from one half, check how many different DWORD | |||
| 15826 | // pairs we need to create. If only 1 or 2 then we can perform this as a | |||
| 15827 | // PSHUFLW/PSHUFHW + PSHUFD instead of the PSHUFD+PSHUFLW+PSHUFHW chain below. | |||
| 15828 | auto ShuffleDWordPairs = [&](ArrayRef<int> PSHUFHalfMask, | |||
| 15829 | ArrayRef<int> PSHUFDMask, unsigned ShufWOp) { | |||
| 15830 | V = DAG.getNode(ShufWOp, DL, VT, V, | |||
| 15831 | getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); | |||
| 15832 | V = DAG.getBitcast(PSHUFDVT, V); | |||
| 15833 | V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V, | |||
| 15834 | getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)); | |||
| 15835 | return DAG.getBitcast(VT, V); | |||
| 15836 | }; | |||
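| | // E.g. Mask [1, 0, 1, 0, 3, 2, 3, 2] only uses low-half inputs and forms | |||
| | // two distinct pairs, (1, 0) and (3, 2): a PSHUFLW with [1, 0, 3, 2] builds | |||
| | // both pairs in the low half, then a PSHUFD with [0, 0, 1, 1] broadcasts | |||
| | // them into place. | |||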
| 15837 | ||||
| 15838 | if ((NumHToL + NumHToH) == 0 || (NumLToL + NumLToH) == 0) { | |||
| 15839 | int PSHUFDMask[4] = { -1, -1, -1, -1 }; | |||
| 15840 | SmallVector<std::pair<int, int>, 4> DWordPairs; | |||
| 15841 | int DOffset = ((NumHToL + NumHToH) == 0 ? 0 : 2); | |||
| 15842 | ||||
| 15843 | // Collect the different DWORD pairs. | |||
| 15844 | for (int DWord = 0; DWord != 4; ++DWord) { | |||
| 15845 | int M0 = Mask[2 * DWord + 0]; | |||
| 15846 | int M1 = Mask[2 * DWord + 1]; | |||
| 15847 | M0 = (M0 >= 0 ? M0 % 4 : M0); | |||
| 15848 | M1 = (M1 >= 0 ? M1 % 4 : M1); | |||
| 15849 | if (M0 < 0 && M1 < 0) | |||
| 15850 | continue; | |||
| 15851 | ||||
| 15852 | bool Match = false; | |||
| 15853 | for (int j = 0, e = DWordPairs.size(); j < e; ++j) { | |||
| 15854 | auto &DWordPair = DWordPairs[j]; | |||
| 15855 | if ((M0 < 0 || isUndefOrEqual(DWordPair.first, M0)) && | |||
| 15856 | (M1 < 0 || isUndefOrEqual(DWordPair.second, M1))) { | |||
| 15857 | DWordPair.first = (M0 >= 0 ? M0 : DWordPair.first); | |||
| 15858 | DWordPair.second = (M1 >= 0 ? M1 : DWordPair.second); | |||
| 15859 | PSHUFDMask[DWord] = DOffset + j; | |||
| 15860 | Match = true; | |||
| 15861 | break; | |||
| 15862 | } | |||
| 15863 | } | |||
| 15864 | if (!Match) { | |||
| 15865 | PSHUFDMask[DWord] = DOffset + DWordPairs.size(); | |||
| 15866 | DWordPairs.push_back(std::make_pair(M0, M1)); | |||
| 15867 | } | |||
| 15868 | } | |||
| 15869 | ||||
| 15870 | if (DWordPairs.size() <= 2) { | |||
| 15871 | DWordPairs.resize(2, std::make_pair(-1, -1)); | |||
| 15872 | int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second, | |||
| 15873 | DWordPairs[1].first, DWordPairs[1].second}; | |||
| 15874 | if ((NumHToL + NumHToH) == 0) | |||
| 15875 | return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW); | |||
| 15876 | if ((NumLToL + NumLToH) == 0) | |||
| 15877 | return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFHW); | |||
| 15878 | } | |||
| 15879 | } | |||
| 15880 | ||||
| 15881 | // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all | |||
| 15882 | // such inputs we can swap two of the dwords across the half mark and end up | |||
| 15883 | // with <=2 inputs to each half in each half. Once there, we can fall through | |||
| 15884 | // to the generic code below. For example: | |||
| 15885 | // | |||
| 15886 | // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] | |||
| 15887 | // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] | |||
| 15888 | // | |||
| 15889 | // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half | |||
| 15890 | // and an existing 2-into-2 on the other half. In this case we may have to | |||
| 15891 | // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or | |||
| 15892 | // 1-into-3 which could cause us to cycle endlessly fixing each side in turn. | |||
| 15893 | // Fortunately, we don't have to handle anything but a 2-into-2 pattern | |||
| 15894 | // because any other situation (including a 3-into-1 or 1-into-3 in the other | |||
| 15895 | // half than the one we target for fixing) will be fixed when we re-enter this | |||
| 15896 | // path. We will also combine any resulting sequence of PSHUFD instructions | |||
| 15897 | // into a single instruction. Here is an example of the tricky case: | |||
| 15898 | // | |||
| 15899 | // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] | |||
| 15900 | // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3] | |||
| 15901 | // | |||
| 15902 | // This now has a 1-into-3 in the high half! Instead, we do two shuffles: | |||
| 15903 | // | |||
| 15904 | // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h] | |||
| 15905 | // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6] | |||
| 15906 | // | |||
| 15907 | // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h] | |||
| 15908 | // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6] | |||
| 15909 | // | |||
| 15910 | // The result is fine to be handled by the generic logic. | |||
| 15911 | auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs, | |||
| 15912 | ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs, | |||
| 15913 | int AOffset, int BOffset) { | |||
| 15914 | assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) && | |||
| 15915 | "Must call this with A having 3 or 1 inputs from the A half."); | |||
| 15916 | assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) && | |||
| 15917 | "Must call this with B having 1 or 3 inputs from the B half."); | |||
| 15918 | assert(AToAInputs.size() + BToAInputs.size() == 4 && | |||
| 15919 | "Must call this with either 3:1 or 1:3 inputs (summing to 4)."); | |||
| 15920 | ||||
| 15921 | bool ThreeAInputs = AToAInputs.size() == 3; | |||
| 15922 | ||||
| 15923 | // Compute the index of dword with only one word among the three inputs in | |||
| 15924 | // a half by taking the sum of the half with three inputs and subtracting | |||
| 15925 | // the sum of the actual three inputs. The difference is the remaining | |||
| 15926 | // slot. | |||
| 15927 | int ADWord = 0, BDWord = 0; | |||
| 15928 | int &TripleDWord = ThreeAInputs ? ADWord : BDWord; | |||
| 15929 | int &OneInputDWord = ThreeAInputs ? BDWord : ADWord; | |||
| 15930 | int TripleInputOffset = ThreeAInputs ? AOffset : BOffset; | |||
| 15931 | ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs; | |||
| 15932 | int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0]; | |||
| 15933 | int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset); | |||
| 15934 | int TripleNonInputIdx = | |||
| 15935 | TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0); | |||
| 15936 | TripleDWord = TripleNonInputIdx / 2; | |||
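| | // E.g. if the three inputs are {4, 5, 7} with TripleInputOffset == 4, then | |||
| | // TripleInputSum is 22 and the inputs sum to 16, so the free slot is word 6 | |||
| | // and TripleDWord is dword 3. | |||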
| 15937 | ||||
| 15938 | // We use xor with one to compute the adjacent DWord to whichever one the | |||
| 15939 | // OneInput is in. | |||
| 15940 | OneInputDWord = (OneInput / 2) ^ 1; | |||
| 15941 | ||||
| 15942 | // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA | |||
| 15943 | // and BToA inputs. If there is also such a problem with the BToB and AToB | |||
| 15944 | // inputs, we don't try to fix it necessarily -- we'll recurse and see it in | |||
| 15945 | // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it | |||
| 15946 | // is essential that we don't *create* a 3<-1 as then we might oscillate. | |||
| 15947 | if (BToBInputs.size() == 2 && AToBInputs.size() == 2) { | |||
| 15948 | // Compute how many inputs will be flipped by swapping these DWords. We need | |||
| 15949 | // to balance this to ensure we don't form a 3-1 shuffle in the other half. | |||
| 15952 | int NumFlippedAToBInputs = llvm::count(AToBInputs, 2 * ADWord) + | |||
| 15953 | llvm::count(AToBInputs, 2 * ADWord + 1); | |||
| 15954 | int NumFlippedBToBInputs = llvm::count(BToBInputs, 2 * BDWord) + | |||
| 15955 | llvm::count(BToBInputs, 2 * BDWord + 1); | |||
| 15956 | if ((NumFlippedAToBInputs == 1 && | |||
| 15957 | (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) || | |||
| 15958 | (NumFlippedBToBInputs == 1 && | |||
| 15959 | (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) { | |||
| 15960 | // We choose whether to fix the A half or B half based on whether that | |||
| 15961 | // half has zero flipped inputs. At zero, we may not be able to fix it | |||
| 15962 | // with that half. We also bias towards fixing the B half because that | |||
| 15963 | // will more commonly be the high half, and we have to bias one way. | |||
| 15964 | auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord, | |||
| 15965 | ArrayRef<int> Inputs) { | |||
| 15966 | int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot. | |||
| 15967 | bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1); | |||
| 15968 | // Determine whether the free index is in the flipped dword or the | |||
| 15969 | // unflipped dword based on where the pinned index is. We use this bit | |||
| 15970 | // in an xor to conditionally select the adjacent dword. | |||
| 15971 | int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord)); | |||
| 15972 | bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); | |||
| 15973 | if (IsFixIdxInput == IsFixFreeIdxInput) | |||
| 15974 | FixFreeIdx += 1; | |||
| 15975 | IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx); | |||
| 15976 | assert(IsFixIdxInput != IsFixFreeIdxInput && | |||
| 15977 | "We need to be changing the number of flipped inputs!"); | |||
| 15978 | int PSHUFHalfMask[] = {0, 1, 2, 3}; | |||
| 15979 | std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]); | |||
| 15980 | V = DAG.getNode( | |||
| 15981 | FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL, | |||
| 15982 | MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V, | |||
| 15983 | getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG)); | |||
| 15984 | ||||
| 15985 | for (int &M : Mask) | |||
| 15986 | if (M >= 0 && M == FixIdx) | |||
| 15987 | M = FixFreeIdx; | |||
| 15988 | else if (M >= 0 && M == FixFreeIdx) | |||
| 15989 | M = FixIdx; | |||
| 15990 | }; | |||
| 15991 | if (NumFlippedBToBInputs != 0) { | |||
| 15992 | int BPinnedIdx = | |||
| 15993 | BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput; | |||
| 15994 | FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs); | |||
| 15995 | } else { | |||
| 15996 | assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!"); | |||
| 15997 | int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput; | |||
| 15998 | FixFlippedInputs(APinnedIdx, ADWord, AToBInputs); | |||
| 15999 | } | |||
| 16000 | } | |||
| 16001 | } | |||
| 16002 | ||||
| 16003 | int PSHUFDMask[] = {0, 1, 2, 3}; | |||
| 16004 | PSHUFDMask[ADWord] = BDWord; | |||
| 16005 | PSHUFDMask[BDWord] = ADWord; | |||
| 16006 | V = DAG.getBitcast( | |||
| 16007 | VT, | |||
| 16008 | DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V), | |||
| 16009 | getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); | |||
| 16010 | ||||
| 16011 | // Adjust the mask to match the new locations of A and B. | |||
| 16012 | for (int &M : Mask) | |||
| 16013 | if (M >= 0 && M/2 == ADWord) | |||
| 16014 | M = 2 * BDWord + M % 2; | |||
| 16015 | else if (M >= 0 && M/2 == BDWord) | |||
| 16016 | M = 2 * ADWord + M % 2; | |||
| 16017 | ||||
| 16018 | // Recurse back into this routine to re-compute state now that this isn't | |||
| 16019 | // a 3 and 1 problem. | |||
| 16020 | return lowerV8I16GeneralSingleInputShuffle(DL, VT, V, Mask, Subtarget, DAG); | |||
| 16021 | }; | |||
| 16022 | if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3)) | |||
| 16023 | return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4); | |||
| 16024 | if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3)) | |||
| 16025 | return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0); | |||
| 16026 | ||||
| 16027 | // At this point there are at most two inputs to the low and high halves from | |||
| 16028 | // each half. That means the inputs can always be grouped into dwords and | |||
| 16029 | // those dwords can then be moved to the correct half with a dword shuffle. | |||
| 16030 | // We use at most one low and one high word shuffle to collect these paired | |||
| 16031 | // inputs into dwords, and finally a dword shuffle to place them. | |||
| 16032 | int PSHUFLMask[4] = {-1, -1, -1, -1}; | |||
| 16033 | int PSHUFHMask[4] = {-1, -1, -1, -1}; | |||
| 16034 | int PSHUFDMask[4] = {-1, -1, -1, -1}; | |||
| 16035 | ||||
| 16036 | // First fix the masks for all the inputs that are staying in their | |||
| 16037 | // original halves. This will then dictate the targets of the cross-half | |||
| 16038 | // shuffles. | |||
| 16039 | auto fixInPlaceInputs = | |||
| 16040 | [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs, | |||
| 16041 | MutableArrayRef<int> SourceHalfMask, | |||
| 16042 | MutableArrayRef<int> HalfMask, int HalfOffset) { | |||
| 16043 | if (InPlaceInputs.empty()) | |||
| 16044 | return; | |||
| 16045 | if (InPlaceInputs.size() == 1) { | |||
| 16046 | SourceHalfMask[InPlaceInputs[0] - HalfOffset] = | |||
| 16047 | InPlaceInputs[0] - HalfOffset; | |||
| 16048 | PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; | |||
| 16049 | return; | |||
| 16050 | } | |||
| 16051 | if (IncomingInputs.empty()) { | |||
| 16052 | // Just fix all of the in place inputs. | |||
| 16053 | for (int Input : InPlaceInputs) { | |||
| 16054 | SourceHalfMask[Input - HalfOffset] = Input - HalfOffset; | |||
| 16055 | PSHUFDMask[Input / 2] = Input / 2; | |||
| 16056 | } | |||
| 16057 | return; | |||
| 16058 | } | |||
| 16059 | ||||
| 16060 | assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); | |||
| 16061 | SourceHalfMask[InPlaceInputs[0] - HalfOffset] = | |||
| 16062 | InPlaceInputs[0] - HalfOffset; | |||
| 16063 | // Put the second input next to the first so that they are packed into | |||
| 16064 | // a dword. We find the adjacent index by toggling the low bit. | |||
| 16065 | int AdjIndex = InPlaceInputs[0] ^ 1; | |||
| 16066 | SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; | |||
| 16067 | std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); | |||
| 16068 | PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; | |||
| 16069 | }; | |||
| 16070 | fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0); | |||
| 16071 | fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4); | |||
| 16072 | ||||
| 16073 | // Now gather the cross-half inputs and place them into a free dword of | |||
| 16074 | // their target half. | |||
| 16075 | // FIXME: This operation could almost certainly be simplified dramatically to | |||
| 16076 | // look more like the 3-1 fixing operation. | |||
| 16077 | auto moveInputsToRightHalf = [&PSHUFDMask]( | |||
| 16078 | MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs, | |||
| 16079 | MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask, | |||
| 16080 | MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset, | |||
| 16081 | int DestOffset) { | |||
| 16082 | auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) { | |||
| 16083 | return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word; | |||
| 16084 | }; | |||
| 16085 | auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask, | |||
| 16086 | int Word) { | |||
| 16087 | int LowWord = Word & ~1; | |||
| 16088 | int HighWord = Word | 1; | |||
| 16089 | return isWordClobbered(SourceHalfMask, LowWord) || | |||
| 16090 | isWordClobbered(SourceHalfMask, HighWord); | |||
| 16091 | }; | |||
| 16092 | ||||
| 16093 | if (IncomingInputs.empty()) | |||
| 16094 | return; | |||
| 16095 | ||||
| 16096 | if (ExistingInputs.empty()) { | |||
| 16097 | // Map any dwords with inputs from them into the right half. | |||
| 16098 | for (int Input : IncomingInputs) { | |||
| 16099 | // If the source half mask maps over the inputs, turn those into | |||
| 16100 | // swaps and use the swapped lane. | |||
| 16101 | if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { | |||
| 16102 | if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) { | |||
| 16103 | SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = | |||
| 16104 | Input - SourceOffset; | |||
| 16105 | // We have to swap the uses in our half mask in one sweep. | |||
| 16106 | for (int &M : HalfMask) | |||
| 16107 | if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset) | |||
| 16108 | M = Input; | |||
| 16109 | else if (M == Input) | |||
| 16110 | M = SourceHalfMask[Input - SourceOffset] + SourceOffset; | |||
| 16111 | } else { | |||
| 16112 | assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == | |||
| 16113 | Input - SourceOffset && | |||
| 16114 | "Previous placement doesn't match!"); | |||
| 16115 | } | |||
| 16116 | // Note that this correctly re-maps both when we do a swap and when | |||
| 16117 | // we observe the other side of the swap above. We rely on that to | |||
| 16118 | // avoid swapping the members of the input list directly. | |||
| 16119 | Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; | |||
| 16120 | } | |||
| 16121 | ||||
| 16122 | // Map the input's dword into the correct half. | |||
| 16123 | if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0) | |||
| 16124 | PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; | |||
| 16125 | else | |||
| 16126 | assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==(static_cast <bool> (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && "Previous placement doesn't match!" ) ? void (0) : __assert_fail ("PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && \"Previous placement doesn't match!\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 16128, __extension__ __PRETTY_FUNCTION__)) | |||
| 16127 | Input / 2 &&(static_cast <bool> (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && "Previous placement doesn't match!" ) ? void (0) : __assert_fail ("PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && \"Previous placement doesn't match!\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 16128, __extension__ __PRETTY_FUNCTION__)) | |||
| 16128 | "Previous placement doesn't match!")(static_cast <bool> (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && "Previous placement doesn't match!" ) ? void (0) : __assert_fail ("PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && \"Previous placement doesn't match!\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 16128, __extension__ __PRETTY_FUNCTION__)); | |||
| 16129 | } | |||
| 16130 | ||||
| 16131 | // And just directly shift any other-half mask elements to be same-half | |||
| 16132 | // as we will have mirrored the dword containing the element into the | |||
| 16133 | // same position within that half. | |||
| 16134 | for (int &M : HalfMask) | |||
| 16135 | if (M >= SourceOffset && M < SourceOffset + 4) { | |||
| 16136 | M = M - SourceOffset + DestOffset; | |||
| 16137 | assert(M >= 0 && "This should never wrap below zero!")(static_cast <bool> (M >= 0 && "This should never wrap below zero!" ) ? void (0) : __assert_fail ("M >= 0 && \"This should never wrap below zero!\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 16137, __extension__ __PRETTY_FUNCTION__)); | |||
| 16138 | } | |||
| 16139 | return; | |||
| 16140 | } | |||
| 16141 | ||||
| 16142 | // Ensure we have the input in a viable dword of its current half. This | |||
| 16143 | // is particularly tricky because the original position may be clobbered | |||
| 16144 | // by inputs being moved and *staying* in that half. | |||
| 16145 | if (IncomingInputs.size() == 1) { | |||
| 16146 | if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { | |||
| 16147 | int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) + | |||
| 16148 | SourceOffset; | |||
| 16149 | SourceHalfMask[InputFixed - SourceOffset] = | |||
| 16150 | IncomingInputs[0] - SourceOffset; | |||
| 16151 | std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], | |||
| 16152 | InputFixed); | |||
| 16153 | IncomingInputs[0] = InputFixed; | |||
| 16154 | } | |||
| 16155 | } else if (IncomingInputs.size() == 2) { | |||
| 16156 | if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || | |||
| 16157 | isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { | |||
| 16158 | // We have two non-adjacent or clobbered inputs we need to extract from | |||
| 16159 | // the source half. To do this, we need to map them into some adjacent | |||
| 16160 | // dword slot in the source mask. | |||
| 16161 | int InputsFixed[2] = {IncomingInputs[0] - SourceOffset, | |||
| 16162 | IncomingInputs[1] - SourceOffset}; | |||
| 16163 | ||||
| 16164 | // If there is a free slot in the source half mask adjacent to one of | |||
| 16165 | // the inputs, place the other input in it. We use (Index XOR 1) to | |||
| 16166 | // compute an adjacent index. | |||
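        // For example, word 2 pairs with 2 ^ 1 == 3 and word 3 pairs with
        // 3 ^ 1 == 2: toggling the low bit always yields the other word of
        // the same dword.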
        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          InputsFixed[1] = InputsFixed[0] ^ 1;
        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
          InputsFixed[0] = InputsFixed[1] ^ 1;
        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
          // The two inputs are in the same DWord but it is clobbered and the
          // adjacent DWord isn't used at all. Move both inputs to the free
          // slot.
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
        } else {
          // The only way we hit this point is if there is no clobbering
          // (because there are no off-half inputs to this half) and there is
          // no free slot adjacent to one of the inputs. In this case, we have
          // to swap an input with a non-input.
          for (int i = 0; i < 4; ++i)
            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
                   "We can't handle any clobbers here!");
          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
                 "Cannot have adjacent inputs here!");

          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

          // We also have to update the final source mask in this case because
          // it may need to undo the above swap.
          for (int &M : FinalSourceHalfMask)
            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
              M = InputsFixed[1] + SourceOffset;
            else if (M == InputsFixed[1] + SourceOffset)
              M = (InputsFixed[0] ^ 1) + SourceOffset;

          InputsFixed[1] = InputsFixed[0] ^ 1;
        }

        // Point everything at the fixed inputs.
        for (int &M : HalfMask)
          if (M == IncomingInputs[0])
            M = InputsFixed[0] + SourceOffset;
          else if (M == IncomingInputs[1])
            M = InputsFixed[1] + SourceOffset;

        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
      }
    } else {
      llvm_unreachable("Unhandled input size!");
    }

    // Now hoist the DWord down to the right half.
    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
    for (int &M : HalfMask)
      for (int Input : IncomingInputs)
        if (M == Input)
          M = FreeDWord * 2 + Input % 2;
  };
  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
                        /*SourceOffset*/ 0, /*DestOffset*/ 4);

  // Now enact all the shuffles we've computed to move the inputs into their
  // target half.
  if (!isNoopShuffleMask(PSHUFLMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFHMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFDMask))
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

  // At this point, each half should contain all its inputs, and we can then
  // just shuffle them into their final position.
  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
         "Failed to lift all the high half inputs to the low mask!");
  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
         "Failed to lift all the low half inputs to the high mask!");

  // Do a half shuffle for the low mask.
  if (!isNoopShuffleMask(LoMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

  // Do a half shuffle with the high mask after shifting its values down.
  for (int &M : HiMask)
    if (M >= 0)
      M -= 4;
  if (!isNoopShuffleMask(HiMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

  return V;
}

/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
static SDValue lowerShuffleAsBlendOfPSHUFBs(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse, bool &V2InUse) {
  assert(!is128BitLaneCrossingShuffleMask(VT, Mask) &&
         "Lane crossing shuffle masks not supported");

  int NumBytes = VT.getSizeInBits() / 8;
  int Size = Mask.size();
  int Scale = NumBytes / Size;
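  // For example, for a v8i16 shuffle NumBytes == 16 and Size == 8, so
  // Scale == 2: each mask element expands to two PSHUFB control bytes.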

  SmallVector<SDValue, 64> V1Mask(NumBytes, DAG.getUNDEF(MVT::i8));
  SmallVector<SDValue, 64> V2Mask(NumBytes, DAG.getUNDEF(MVT::i8));
  V1InUse = false;
  V2InUse = false;

  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / Scale];
    if (M < 0)
      continue;

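    // PSHUFB writes a zero byte whenever bit 7 of the control byte is set,
    // so 0x80 below acts as a "force this lane to zero" selector.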
    const int ZeroMask = 0x80;
    int V1Idx = M < Size ? M * Scale + i % Scale : ZeroMask;
    int V2Idx = M < Size ? ZeroMask : (M - Size) * Scale + i % Scale;
    if (Zeroable[i / Scale])
      V1Idx = V2Idx = ZeroMask;

    V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
    V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
    V1InUse |= (ZeroMask != V1Idx);
    V2InUse |= (ZeroMask != V2Idx);
  }

  MVT ShufVT = MVT::getVectorVT(MVT::i8, NumBytes);
  if (V1InUse)
    V1 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V1),
                     DAG.getBuildVector(ShufVT, DL, V1Mask));
  if (V2InUse)
    V2 = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, DAG.getBitcast(ShufVT, V2),
                     DAG.getBuildVector(ShufVT, DL, V2Mask));

  // If we need shuffled inputs from both, blend the two.
  SDValue V;
  if (V1InUse && V2InUse)
    V = DAG.getNode(ISD::OR, DL, ShufVT, V1, V2);
  else
    V = V1InUse ? V1 : V2;

  // Cast the result back to the correct type.
  return DAG.getBitcast(VT, V);
}

/// Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single-input shuffles are immediately delegated to
/// a dedicated lowering routine.
///
/// The blends are lowered in one of three fundamental ways. If there are few
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
/// of the input is significantly cheaper when lowered as an interleaving of
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
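/// For example, the two-input mask <0, 8, 1, 9, 2, 10, 3, 11> interleaves the
/// low halves of the inputs and is matched to a single PUNPCKLWD by the
/// dedicated unpack lowering below.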
static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i16, V1, V2, Mask,
                                                   Zeroable, Subtarget, DAG))
    return ZExt;

  // Try to lower using a truncation.
  if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
                                        Subtarget, DAG))
    return V;

  int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });

  if (NumV2Inputs == 0) {
    // Try to use shift instructions.
    if (SDValue Shift =
            lowerShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask, Zeroable,
                                Subtarget, DAG, /*BitwiseOnly*/ false))
      return Shift;

    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i16, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Broadcast;

    // Try to use bit rotation instructions.
    if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
                                                 Subtarget, DAG))
      return Rotate;

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
      return V;

    // Use dedicated pack instructions for masks that match their pattern.
    if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
                                         Subtarget))
      return V;

    // Try to use byte rotation instructions.
    if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask,
                                                  Subtarget, DAG))
      return Rotate;

    // Make a copy of the mask so it can be modified.
    SmallVector<int, 8> MutableMask(Mask);
    return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v8i16, V1, MutableMask,
                                               Subtarget, DAG);
  }

  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
         "All single-input shuffles should be canonicalized to be V1-input "
         "shuffles.");

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget,
                              DAG, /*BitwiseOnly*/ false))
    return Shift;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
                                          Zeroable, DAG))
      return V;

  // There are special ways we can lower some single-element blends.
  if (NumV2Inputs == 1)
    if (SDValue V = lowerShuffleAsElementInsertion(
            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
                                            Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
                                             Zeroable, Subtarget, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
    return V;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
                                       Subtarget))
    return V;

  // Try to lower using a truncation.
  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v8i16, V1, V2, Mask, Zeroable,
                                       Subtarget, DAG))
    return V;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i16, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  if (SDValue BitBlend =
          lowerShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
    return BitBlend;

  // Try to use byte shift instructions to mask.
  if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
                                              Zeroable, Subtarget, DAG))
    return V;

  // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
  // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets, but this seems
  // to be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
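  // For example, assuming the compaction mask <0, 2, 4, 6, 8, 10, 12, 14>
  // (NumEvenDrops == 1): clearing the high word of every dword in both inputs
  // lets a single PACKUSDW compact the surviving words together.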
  int NumEvenDrops = canLowerByDroppingElements(Mask, true, false);
  if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
      !Subtarget.hasVLX()) {
    // Check if this is part of a 256-bit vector truncation.
    if (NumEvenDrops == 2 && Subtarget.hasAVX2() &&
        peekThroughBitcasts(V1).getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        peekThroughBitcasts(V2).getOpcode() == ISD::EXTRACT_SUBVECTOR) {
      SDValue V1V2 = concatSubVectors(V1, V2, DAG, DL);
      V1V2 = DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1V2,
                         getZeroVector(MVT::v16i16, Subtarget, DAG, DL),
                         DAG.getTargetConstant(0xEE, DL, MVT::i8));
      V1V2 = DAG.getBitcast(MVT::v8i32, V1V2);
      V1 = extract128BitVector(V1V2, 0, DAG, DL);
      V2 = extract128BitVector(V1V2, 4, DAG, DL);
    } else {
      SmallVector<SDValue, 4> DWordClearOps(4,
                                            DAG.getConstant(0, DL, MVT::i32));
      for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
        DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
      SDValue DWordClearMask =
          DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
      V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32,
                       DAG.getBitcast(MVT::v4i32, V1), DWordClearMask);
      V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32,
                       DAG.getBitcast(MVT::v4i32, V2), DWordClearMask);
    }
    // Now pack things back together.
    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
    if (NumEvenDrops == 2) {
      Result = DAG.getBitcast(MVT::v4i32, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
    }
    return Result;
  }

  // When compacting odd (upper) elements, use PACKSS pre-SSE41.
  int NumOddDrops = canLowerByDroppingElements(Mask, false, false);
  if (NumOddDrops == 1) {
    bool HasSSE41 = Subtarget.hasSSE41();
    V1 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
                     DAG.getBitcast(MVT::v4i32, V1),
                     DAG.getTargetConstant(16, DL, MVT::i8));
    V2 = DAG.getNode(HasSSE41 ? X86ISD::VSRLI : X86ISD::VSRAI, DL, MVT::v4i32,
                     DAG.getBitcast(MVT::v4i32, V2),
                     DAG.getTargetConstant(16, DL, MVT::i8));
    return DAG.getNode(HasSSE41 ? X86ISD::PACKUS : X86ISD::PACKSS, DL,
                       MVT::v8i16, V1, V2);
  }

  // Try to lower by permuting the inputs into an unpack instruction.
  if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
                                                      Mask, Subtarget, DAG))
    return Unpack;

  // If we can't directly blend but can use PSHUFB, that will be better as it
  // can both shuffle and set up the inefficient blend.
  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
    bool V1InUse, V2InUse;
    return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
                                        Zeroable, DAG, V1InUse, V2InUse);
  }

  // We can always bit-blend if we have to, so the fallback strategy is to
  // decompose into single-input permutes and blends/unpacks.
  return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i16, V1, V2,
                                              Mask, Subtarget, DAG);
}

/// Lower 8-lane 16-bit floating point shuffles.
static SDValue lowerV8F16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8f16 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
  int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; });

  if (Subtarget.hasFP16()) {
    if (NumV2Elements == 0) {
      // Check for being able to broadcast a single element.
      if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f16, V1, V2,
                                                      Mask, Subtarget, DAG))
        return Broadcast;
    }
    if (NumV2Elements == 1 && Mask[0] >= 8)
      if (SDValue V = lowerShuffleAsElementInsertion(
              DL, MVT::v8f16, V1, V2, Mask, Zeroable, Subtarget, DAG))
        return V;
  }

  V1 = DAG.getBitcast(MVT::v8i16, V1);
  V2 = DAG.getBitcast(MVT::v8i16, V2);
  return DAG.getBitcast(MVT::v8f16,
                        DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, Mask));
}

// Lowers unary/binary shuffle as VPERMV/VPERMV3. For non-VLX targets,
// sub-512-bit shuffles are padded to 512 bits for the shuffle and then
// the active subvector is extracted.
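// For example, assuming AVX512BW without AVX512VL, a two-input v8i16 shuffle
// is widened to v32i16, lowered as a single VPERMT2W, and the low 128 bits of
// the result are extracted afterwards.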
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
                                     ArrayRef<int> Mask, SDValue V1, SDValue V2,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  MVT MaskVT = VT.changeTypeToInteger();
  SDValue MaskNode;
  MVT ShuffleVT = VT;
  if (!VT.is512BitVector() && !Subtarget.hasVLX()) {
    V1 = widenSubVector(V1, false, Subtarget, DAG, DL, 512);
    V2 = widenSubVector(V2, false, Subtarget, DAG, DL, 512);
    ShuffleVT = V1.getSimpleValueType();

    // Adjust mask to correct indices for the second input.
    int NumElts = VT.getVectorNumElements();
    unsigned Scale = 512 / VT.getSizeInBits();
    SmallVector<int, 32> AdjustedMask(Mask);
    for (int &M : AdjustedMask)
      if (NumElts <= M)
        M += (Scale - 1) * NumElts;
    MaskNode = getConstVector(AdjustedMask, MaskVT, DAG, DL, true);
    MaskNode = widenSubVector(MaskNode, false, Subtarget, DAG, DL, 512);
  } else {
    MaskNode = getConstVector(Mask, MaskVT, DAG, DL, true);
  }

  SDValue Result;
  if (V2.isUndef())
    Result = DAG.getNode(X86ISD::VPERMV, DL, ShuffleVT, MaskNode, V1);
  else
    Result = DAG.getNode(X86ISD::VPERMV3, DL, ShuffleVT, V1, MaskNode, V2);

  if (VT != ShuffleVT)
    Result = extractSubVector(Result, 0, DAG, DL, VT.getSizeInBits());

  return Result;
}

/// Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity reducing interleaving. If that doesn't help, it uses
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
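/// For example, a mask in which every adjacent byte pair is equal (such as a
/// byte splat) can be expressed as an i16 shuffle combined with a byte
/// duplication; tryToWidenViaDuplication below checks for exactly that shape.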
static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                 const APInt &Zeroable, SDValue V1, SDValue V2,
                                 const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Try to use shift instructions.
  if (SDValue Shift =
          lowerShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget,
                              DAG, /*BitwiseOnly*/ false))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i8, V1, V2, Mask,
                                                Subtarget, DAG))
    return Rotate;

  // Use dedicated pack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
                                       Subtarget))
    return V;

  // Try to use a zext lowering.
  if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v16i8, V1, V2, Mask,
                                                   Zeroable, Subtarget, DAG))
    return ZExt;

  // Try to lower using a truncation.
  if (SDValue V = lowerShuffleWithVPMOV(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
                                        Subtarget, DAG))
    return V;

  if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i8, V1, V2, Mask, Zeroable,
                                       Subtarget, DAG))
    return V;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
                                          Zeroable, DAG))
      return V;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });

  // For single-input shuffles, there are some nicer lowering tricks we can use.
  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i8, V1, V2,
                                                    Mask, Subtarget, DAG))
      return Broadcast;

    // Try to use bit rotation instructions.
    if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
                                                 Subtarget, DAG))
      return Rotate;

    if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
      return V;

    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
    // Notably, this handles splat and partial-splat shuffles more efficiently.
    // However, it only makes sense if the pre-duplication shuffle simplifies
    // things significantly. Currently, this means we need to be able to
    // express the pre-duplication shuffle as an i16 shuffle.
    //
    // FIXME: We should check for other patterns which can be widened into an
    // i16 shuffle as well.
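    // For example, a splat of byte 5 becomes a PUNPCKLBW that duplicates each
    // low byte (so word 5 then holds two copies of byte 5) followed by the
    // v8i16 word splat <5, 5, 5, 5, 5, 5, 5, 5>.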
    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
      for (int i = 0; i < 16; i += 2)
        if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
          return false;

      return true;
    };
    auto tryToWidenViaDuplication = [&]() -> SDValue {
      if (!canWidenViaDuplication(Mask))
        return SDValue();
      SmallVector<int, 4> LoInputs;
      copy_if(Mask, std::back_inserter(LoInputs),
              [](int M) { return M >= 0 && M < 8; });
      array_pod_sort(LoInputs.begin(), LoInputs.end());
      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
                     LoInputs.end());
      SmallVector<int, 4> HiInputs;
      copy_if(Mask, std::back_inserter(HiInputs),
              [](int M) { return M >= 8; });
      array_pod_sort(HiInputs.begin(), HiInputs.end());
      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
                     HiInputs.end());

      bool TargetLo = LoInputs.size() >= HiInputs.size();
      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
      SmallDenseMap<int, int, 8> LaneMap;
      for (int I : InPlaceInputs) {
        PreDupI16Shuffle[I / 2] = I / 2;
        LaneMap[I] = I;
      }
      int j = TargetLo ? 0 : 4, je = j + 4;
      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
        // Check if j is already a shuffle of this input. This happens when
        // there are two adjacent bytes after we move the low one.
        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
          // If we haven't yet mapped the input, search for a slot into which
          // we can map it.
          while (j < je && PreDupI16Shuffle[j] >= 0)
            ++j;

          if (j == je)
            // We can't place the inputs into a single half with a simple
            // i16 shuffle, so bail.
            return SDValue();

          // Map this input with the i16 shuffle.
          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
        }

        // Update the lane map based on the mapping we ended up with.
        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
      }
      V1 = DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

      // Unpack the bytes to form the i16s that will be shuffled into place.
      bool EvenInUse = false, OddInUse = false;
      for (int i = 0; i < 16; i += 2) {
        EvenInUse |= (Mask[i + 0] >= 0);
        OddInUse |= (Mask[i + 1] >= 0);
        if (EvenInUse && OddInUse)
          break;
      }
      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                       MVT::v16i8, EvenInUse ? V1 : DAG.getUNDEF(MVT::v16i8),
                       OddInUse ? V1 : DAG.getUNDEF(MVT::v16i8));

      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
      for (int i = 0; i < 16; ++i)
        if (Mask[i] >= 0) {
          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
          if (PostDupI16Shuffle[i / 2] < 0)
            PostDupI16Shuffle[i / 2] = MappedMask;
          else
            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                   "Conflicting entries in the original shuffle!");
        }
      return DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
    };
    if (SDValue V = tryToWidenViaDuplication())
      return V;
  }

  if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
                                             Zeroable, Subtarget, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
    return V;

  // Try to use byte shift instructions to mask.
  if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
                                              Zeroable, Subtarget, DAG))
    return V;

  // Check for compaction patterns.
  bool IsSingleInput = V2.isUndef();
  int NumEvenDrops = canLowerByDroppingElements(Mask, true, IsSingleInput);

  // Check for SSSE3, which lets us lower all v16i8 shuffles much more directly
  // with PSHUFB. It is important to do this before we attempt to generate any
  // blends but after all of the single-input lowerings. If the single-input
  // lowerings can find an instruction sequence that is faster than a PSHUFB,
  // we want to preserve that and we can DAG combine any longer sequences into
  // a PSHUFB in the end. But once we start blending from multiple inputs,
  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
  // and there are *very* few patterns that would actually be faster than the
  // PSHUFB approach because of its ability to zero lanes.
  //
  // If the mask is a binary compaction, we can more efficiently perform this
  // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
  //
  // FIXME: The only exceptions to the above are blends which are exact
  // interleavings with direct instructions supporting them. We currently don't
  // handle those well here.
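  // For example, for the binary compaction mask <0, 2, 4, ..., 30>,
  // PACKUSWB(AND(V1, 0x00FF), AND(V2, 0x00FF)) keeps the low byte of every
  // word and packs them, avoiding the second PSHUFB and the merging OR.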
  if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
    bool V1InUse = false;
    bool V2InUse = false;

    SDValue PSHUFB = lowerShuffleAsBlendOfPSHUFBs(
        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
    // do so. This avoids using them to handle blends-with-zero, which is
    // important as a single pshufb is significantly faster for that.
    if (V1InUse && V2InUse) {
      if (Subtarget.hasSSE41())
        if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
          return Blend;

      // We can use an unpack to do the blending rather than an or in some
      // cases. Even though the or may be (very minorly) more efficient, we
      // prefer this lowering because there are common cases where part of
      // the complexity of the shuffles goes away when we do the final blend
      // as an unpack.
      // FIXME: It might be worth trying to detect if the unpack-feeding
      // shuffles will both be pshufb, in which case we shouldn't bother with
      // this.
      if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(
              DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
        return Unpack;

      // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8).
      if (Subtarget.hasVBMI())
        return lowerShuffleWithPERMV(DL, MVT::v16i8, Mask, V1, V2, Subtarget,
                                     DAG);

      // If we have XOP we can use one VPPERM instead of multiple PSHUFBs.
      if (Subtarget.hasXOP()) {
        SDValue MaskNode = getConstVector(Mask, MVT::v16i8, DAG, DL, true);
        return DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, V1, V2, MaskNode);
      }

      // Use PALIGNR+Permute if possible - permute might become PSHUFB but the
      // PALIGNR will be cheaper than the second PSHUFB+OR.
      if (SDValue V = lowerShuffleAsByteRotateAndPermute(
              DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
        return V;
    }

    return PSHUFB;
  }

  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerShuffleAsElementInsertion(
            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (SDValue Blend = lowerShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
    return Blend;

  // Check whether a compaction lowering can be done. This handles shuffles
  // which take every Nth element for some even N. See the helper function for
  // details.
  //
  // We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
  // rearranging bytes to truncate wide elements.
  if (NumEvenDrops) {
    // NumEvenDrops is the power of two stride of the elements. Another way of
    // thinking about it is that we need to drop the even elements this many
    // times to get the original input.
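    // For example, NumEvenDrops == 2 corresponds to a stride of 4 (taking
    // bytes 0, 4, 8, ...), which needs two rounds of PACKUS below to compact.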

    // First we need to zero all the dropped bytes.
    assert(NumEvenDrops <= 3 &&
           "No support for dropping even elements more than 3 times.");
    SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
    for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
      WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
    SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
    V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
                     WordClearMask);
    if (!IsSingleInput)
      V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16,
                       DAG.getBitcast(MVT::v8i16, V2), WordClearMask);

    // Now pack things back together.
    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
                                 IsSingleInput ? V1 : V2);
    for (int i = 1; i < NumEvenDrops; ++i) {
      Result = DAG.getBitcast(MVT::v8i16, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
    }
    return Result;
  }

  int NumOddDrops = canLowerByDroppingElements(Mask, false, IsSingleInput);
  if (NumOddDrops == 1) {
    V1 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
                     DAG.getBitcast(MVT::v8i16, V1),
                     DAG.getTargetConstant(8, DL, MVT::i8));
    if (!IsSingleInput)
      V2 = DAG.getNode(X86ISD::VSRLI, DL, MVT::v8i16,
                       DAG.getBitcast(MVT::v8i16, V2),
                       DAG.getTargetConstant(8, DL, MVT::i8));
    return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
                       IsSingleInput ? V1 : V2);
  }

  // Handle multi-input cases by blending/unpacking single-input shuffles.
  if (NumV2Elements > 0)
    return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v16i8, V1, V2, Mask,
                                                Subtarget, DAG);

  // The fallback path for single-input shuffles widens this into two v8i16
  // vectors with unpacks, shuffles those, and then pulls them back together
  // with a pack.
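  // Illustratively, PUNPCKLBW/PUNPCKHBW against a zero vector produce two
  // v8i16 halves whose words each hold one zero-extended source byte; those
  // halves are shuffled with the v8i16 lowering, and a final PACKUSWB drops
  // the zeroed high bytes again.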
  SDValue V = V1;

  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0)
      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

  SDValue VLoHalf, VHiHalf;
  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
  // them out and avoid using UNPCK{L,H} to extract the elements of V as
  // i16s.
  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
    // Use a mask to drop the high bytes.
    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
                          DAG.getConstant(0x00FF, DL, MVT::v8i16));

    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
    VHiHalf = DAG.getUNDEF(MVT::v8i16);

    // Squash the masks to point directly into VLoHalf.
    for (int &M : LoBlendMask)
      if (M >= 0)
        M /= 2;
    for (int &M : HiBlendMask)
      if (M >= 0)
        M /= 2;
  } else {
    // Otherwise just unpack the low half of V into VLoHalf and the high half
    // into VHiHalf so that we can blend them as i16s.
    SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

    VLoHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
    VHiHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
  }

  SDValue LoV =
      DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
  SDValue HiV =
      DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);

  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}

/// Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                  MVT VT, SDValue V1, SDValue V2,
                                  const APInt &Zeroable,
                                  const X86Subtarget &Subtarget,
                                  SelectionDAG &DAG) {
  switch (VT.SimpleTy) {
  case MVT::v2i64:
    return lowerV2I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v2f64:
    return lowerV2F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i32:
    return lowerV4I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4f32:
    return lowerV4F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i16:
    return lowerV8I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8f16:
    return lowerV8F16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i8:
    return lowerV16I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Unimplemented!");
  }
}

/// Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
| 16995 | static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1, | |||
| 16996 | SDValue V2, ArrayRef<int> Mask, | |||
| 16997 | SelectionDAG &DAG, bool SimpleOnly) { | |||
| 16998 | assert(VT.getSizeInBits() >= 256 && | |||
| 16999 | "Only for 256-bit or wider vector shuffles!"); | |||
| 17000 | assert(V1.getSimpleValueType() == VT && "Bad operand type!"); | |||
| 17001 | assert(V2.getSimpleValueType() == VT && "Bad operand type!"); | |||
| 17002 | ||||
| 17003 | ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2); | |||
| 17004 | ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2); | |||
| 17005 | ||||
| 17006 | int NumElements = VT.getVectorNumElements(); | |||
| 17007 | int SplitNumElements = NumElements / 2; | |||
| 17008 | MVT ScalarVT = VT.getVectorElementType(); | |||
| 17009 | MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements); | |||
| 17010 | ||||
| 17011 | // Use splitVector/extractSubVector so that split build-vectors just build two | |||
| 17012 | // narrower build vectors. This helps shuffling with splats and zeros. | |||
| 17013 | auto SplitVector = [&](SDValue V) { | |||
| 17014 | SDValue LoV, HiV; | |||
| 17015 | std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL); | |||
| 17016 | return std::make_pair(DAG.getBitcast(SplitVT, LoV), | |||
| 17017 | DAG.getBitcast(SplitVT, HiV)); | |||
| 17018 | }; | |||
| 17019 | ||||
| 17020 | SDValue LoV1, HiV1, LoV2, HiV2; | |||
| 17021 | std::tie(LoV1, HiV1) = SplitVector(V1); | |||
| 17022 | std::tie(LoV2, HiV2) = SplitVector(V2); | |||
| 17023 | ||||
| 17024 | // Now create two 4-way blends of these half-width vectors. | |||
| 17025 | auto GetHalfBlendPiecesReq = [&](const ArrayRef<int> &HalfMask, bool &UseLoV1, | |||
| 17026 | bool &UseHiV1, bool &UseLoV2, | |||
| 17027 | bool &UseHiV2) { | |||
| 17028 | UseLoV1 = UseHiV1 = UseLoV2 = UseHiV2 = false; | |||
| 17029 | for (int i = 0; i < SplitNumElements; ++i) { | |||
| 17030 | int M = HalfMask[i]; | |||
| 17031 | if (M >= NumElements) { | |||
| 17032 | if (M >= NumElements + SplitNumElements) | |||
| 17033 | UseHiV2 = true; | |||
| 17034 | else | |||
| 17035 | UseLoV2 = true; | |||
| 17036 | } else if (M >= 0) { | |||
| 17037 | if (M >= SplitNumElements) | |||
| 17038 | UseHiV1 = true; | |||
| 17039 | else | |||
| 17040 | UseLoV1 = true; | |||
| 17041 | } | |||
| 17042 | } | |||
| 17043 | }; | |||
| 17044 | ||||
| 17045 | auto CheckHalfBlendUsable = [&](const ArrayRef<int> &HalfMask) -> bool { | |||
| 17046 | if (!SimpleOnly) | |||
| 17047 | return true; | |||
| 17048 | ||||
| 17049 | bool UseLoV1, UseHiV1, UseLoV2, UseHiV2; | |||
| 17050 | GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2); | |||
| 17051 | ||||
| 17052 | return !(UseHiV1 || UseHiV2); | |||
| 17053 | }; | |||
| 17054 | ||||
| 17055 | auto HalfBlend = [&](ArrayRef<int> HalfMask) { | |||
| 17056 | SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1); | |||
| 17057 | SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1); | |||
| 17058 | SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1); | |||
| 17059 | for (int i = 0; i < SplitNumElements; ++i) { | |||
| 17060 | int M = HalfMask[i]; | |||
| 17061 | if (M >= NumElements) { | |||
| 17062 | V2BlendMask[i] = M - NumElements; | |||
| 17063 | BlendMask[i] = SplitNumElements + i; | |||
| 17064 | } else if (M >= 0) { | |||
| 17065 | V1BlendMask[i] = M; | |||
| 17066 | BlendMask[i] = i; | |||
| 17067 | } | |||
| 17068 | } | |||
| 17069 | ||||
| 17070 | bool UseLoV1, UseHiV1, UseLoV2, UseHiV2; | |||
| 17071 | GetHalfBlendPiecesReq(HalfMask, UseLoV1, UseHiV1, UseLoV2, UseHiV2); | |||
| 17072 | ||||
| 17073 | // Because the lowering happens after all combining takes place, we need to | |||
| 17074 | // manually combine these blend masks as much as possible so that we create | |||
| 17075 | // a minimal number of high-level vector shuffle nodes. | |||
| 17076 | assert((!SimpleOnly || (!UseHiV1 && !UseHiV2)) && "Shuffle isn't simple"); | |||
| 17077 | ||||
| 17078 | // First try just blending the halves of V1 or V2. | |||
| 17079 | if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2) | |||
| 17080 | return DAG.getUNDEF(SplitVT); | |||
| 17081 | if (!UseLoV2 && !UseHiV2) | |||
| 17082 | return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); | |||
| 17083 | if (!UseLoV1 && !UseHiV1) | |||
| 17084 | return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); | |||
| 17085 | ||||
| 17086 | SDValue V1Blend, V2Blend; | |||
| 17087 | if (UseLoV1 && UseHiV1) { | |||
| 17088 | V1Blend = DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask); | |||
| 17089 | } else { | |||
| 17090 | // We only use half of V1 so map the usage down into the final blend mask. | |||
| 17091 | V1Blend = UseLoV1 ? LoV1 : HiV1; | |||
| 17092 | for (int i = 0; i < SplitNumElements; ++i) | |||
| 17093 | if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements) | |||
| 17094 | BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements); | |||
| 17095 | } | |||
| 17096 | if (UseLoV2 && UseHiV2) { | |||
| 17097 | V2Blend = DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask); | |||
| 17098 | } else { | |||
| 17099 | // We only use half of V2 so map the usage down into the final blend mask. | |||
| 17100 | V2Blend = UseLoV2 ? LoV2 : HiV2; | |||
| 17101 | for (int i = 0; i < SplitNumElements; ++i) | |||
| 17102 | if (BlendMask[i] >= SplitNumElements) | |||
| 17103 | BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0); | |||
| 17104 | } | |||
| 17105 | return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask); | |||
| 17106 | }; | |||
| 17107 | ||||
| 17108 | if (!CheckHalfBlendUsable(LoMask) || !CheckHalfBlendUsable(HiMask)) | |||
| 17109 | return SDValue(); | |||
| 17110 | ||||
| 17111 | SDValue Lo = HalfBlend(LoMask); | |||
| 17112 | SDValue Hi = HalfBlend(HiMask); | |||
| 17113 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); | |||
| 17114 | } | |||
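| | // A hedged sketch (plain std::vector stand-ins, no LLVM types) of the | |||
| | // index bookkeeping inside HalfBlend above: a half mask referencing | |||
| | // 2 * NumElements inputs decomposes into per-operand blend masks plus a | |||
| | // final selector between the two shuffled halves. | |||
| | static void buildHalfBlendMasks(const std::vector<int> &HalfMask, | |||
| | int NumElements, std::vector<int> &V1M, | |||
| | std::vector<int> &V2M, | |||
| | std::vector<int> &BlendM) { | |||
| | int Split = NumElements / 2; | |||
| | V1M.assign(Split, -1); | |||
| | V2M.assign(Split, -1); | |||
| | BlendM.assign(Split, -1); | |||
| | for (int i = 0; i != Split; ++i) { | |||
| | int M = HalfMask[i]; | |||
| | if (M >= NumElements) { // element comes from V2 | |||
| | V2M[i] = M - NumElements; | |||
| | BlendM[i] = Split + i; // take slot i of the shuffled V2 half | |||
| | } else if (M >= 0) { // element comes from V1 | |||
| | V1M[i] = M; | |||
| | BlendM[i] = i; // take slot i of the shuffled V1 half | |||
| | } | |||
| | } | |||
| | } | |||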
| 17115 | ||||
| 17116 | /// Either split a vector in halves or decompose the shuffles and the | |||
| 17117 | /// blend/unpack. | |||
| 17118 | /// | |||
| 17119 | /// This is provided as a good fallback for many lowerings of non-single-input | |||
| 17120 | /// shuffles with more than one 128-bit lane. In those cases, we want to select | |||
| 17121 | /// between splitting the shuffle into 128-bit components and stitching those | |||
| 17122 | /// back together vs. extracting the single-input shuffles and blending those | |||
| 17123 | /// results. | |||
| 17124 | static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1, | |||
| 17125 | SDValue V2, ArrayRef<int> Mask, | |||
| 17126 | const X86Subtarget &Subtarget, | |||
| 17127 | SelectionDAG &DAG) { | |||
| 17128 | assert(!V2.isUndef() && "This routine must not be used to lower single-input " | |||
| 17129 | "shuffles as it could then recurse on itself."); | |||
| 17130 | int Size = Mask.size(); | |||
| 17131 | ||||
| 17132 | // If this can be modeled as a broadcast of two elements followed by a blend, | |||
| 17133 | // prefer that lowering. This is especially important because broadcasts can | |||
| 17134 | // often fold with memory operands. | |||
| 17135 | auto DoBothBroadcast = [&] { | |||
| 17136 | int V1BroadcastIdx = -1, V2BroadcastIdx = -1; | |||
| 17137 | for (int M : Mask) | |||
| 17138 | if (M >= Size) { | |||
| 17139 | if (V2BroadcastIdx < 0) | |||
| 17140 | V2BroadcastIdx = M - Size; | |||
| 17141 | else if (M - Size != V2BroadcastIdx) | |||
| 17142 | return false; | |||
| 17143 | } else if (M >= 0) { | |||
| 17144 | if (V1BroadcastIdx < 0) | |||
| 17145 | V1BroadcastIdx = M; | |||
| 17146 | else if (M != V1BroadcastIdx) | |||
| 17147 | return false; | |||
| 17148 | } | |||
| 17149 | return true; | |||
| 17150 | }; | |||
| 17151 | if (DoBothBroadcast()) | |||
| 17152 | return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget, | |||
| 17153 | DAG); | |||
| 17154 | ||||
| 17155 | // If the inputs all stem from a single 128-bit lane of each input, then we | |||
| 17156 | // split them rather than blending because the split will decompose to | |||
| 17157 | // unusually few instructions. | |||
| 17158 | int LaneCount = VT.getSizeInBits() / 128; | |||
| 17159 | int LaneSize = Size / LaneCount; | |||
| 17160 | SmallBitVector LaneInputs[2]; | |||
| 17161 | LaneInputs[0].resize(LaneCount, false); | |||
| 17162 | LaneInputs[1].resize(LaneCount, false); | |||
| 17163 | for (int i = 0; i < Size; ++i) | |||
| 17164 | if (Mask[i] >= 0) | |||
| 17165 | LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true; | |||
| 17166 | if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1) | |||
| 17167 | return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, | |||
| 17168 | /*SimpleOnly*/ false); | |||
| 17169 | ||||
| 17170 | // Otherwise, just fall back to decomposed shuffles and a blend/unpack. This | |||
| 17171 | // requires that the decomposed single-input shuffles don't end up here. | |||
| 17172 | return lowerShuffleAsDecomposedShuffleMerge(DL, VT, V1, V2, Mask, Subtarget, | |||
| 17173 | DAG); | |||
| 17174 | } | |||
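| | // A stand-alone restatement (assuming only <vector>) of the | |||
| | // DoBothBroadcast test above: the mask is a blend of two broadcasts iff | |||
| | // every V1 reference names a single element and every V2 reference names | |||
| | // a single element. | |||
| | static bool bothBroadcast(const std::vector<int> &Mask, int Size) { | |||
| | int B1 = -1, B2 = -1; | |||
| | for (int M : Mask) { | |||
| | if (M >= Size) { // V2 element | |||
| | if (B2 < 0) | |||
| | B2 = M - Size; | |||
| | else if (M - Size != B2) | |||
| | return false; | |||
| | } else if (M >= 0) { // V1 element | |||
| | if (B1 < 0) | |||
| | B1 = M; | |||
| | else if (M != B1) | |||
| | return false; | |||
| | } | |||
| | } | |||
| | return true; // e.g. <2,9,2,9,2,9,2,9> with Size 8 -> true | |||
| | } | |||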
| 17175 | ||||
| 17176 | // Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)). | |||
| 17177 | // TODO: Extend to support v8f32 (+ 512-bit shuffles). | |||
| 17178 | static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT, | |||
| 17179 | SDValue V1, SDValue V2, | |||
| 17180 | ArrayRef<int> Mask, | |||
| 17181 | SelectionDAG &DAG) { | |||
| 17182 | assert(VT == MVT::v4f64 && "Only for v4f64 shuffles"); | |||
| 17183 | ||||
| 17184 | int LHSMask[4] = {-1, -1, -1, -1}; | |||
| 17185 | int RHSMask[4] = {-1, -1, -1, -1}; | |||
| 17186 | unsigned SHUFPMask = 0; | |||
| 17187 | ||||
| 17188 | // As SHUFPD uses a single LHS/RHS element per lane, we can always | |||
| 17189 | // perform the shuffle once the lanes have been shuffled in place. | |||
| 17190 | for (int i = 0; i != 4; ++i) { | |||
| 17191 | int M = Mask[i]; | |||
| 17192 | if (M < 0) | |||
| 17193 | continue; | |||
| 17194 | int LaneBase = i & ~1; | |||
| 17195 | auto &LaneMask = (i & 1) ? RHSMask : LHSMask; | |||
| 17196 | LaneMask[LaneBase + (M & 1)] = M; | |||
| 17197 | SHUFPMask |= (M & 1) << i; | |||
| 17198 | } | |||
| 17199 | ||||
| 17200 | SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask); | |||
| 17201 | SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask); | |||
| 17202 | return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS, | |||
| 17203 | DAG.getTargetConstant(SHUFPMask, DL, MVT::i8)); | |||
| 17204 | } | |||
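| | // An illustrative recomputation of the SHUFPMask built above, assuming a | |||
| | // fully-defined v4f64 mask: bit i of the immediate selects the odd (1) or | |||
| | // even (0) double of the 128-bit lane feeding result element i. | |||
| | static unsigned shufpdImmediate(const int (&Mask)[4]) { | |||
| | unsigned Imm = 0; | |||
| | for (int i = 0; i != 4; ++i) | |||
| | Imm |= unsigned(Mask[i] & 1) << i; // pick the odd/even element per slot | |||
| | return Imm; // e.g. {1, 5, 2, 7} yields 0b1011 | |||
| | } | |||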
| 17205 | ||||
| 17206 | /// Lower a vector shuffle crossing multiple 128-bit lanes as | |||
| 17207 | /// a lane permutation followed by a per-lane permutation. | |||
| 17208 | /// | |||
| 17209 | /// This is mainly for cases where we can have non-repeating permutes | |||
| 17210 | /// in each lane. | |||
| 17211 | /// | |||
| 17212 | /// TODO: This is very similar to lowerShuffleAsLanePermuteAndRepeatedMask, | |||
| 17213 | /// we should investigate merging them. | |||
| 17214 | static SDValue lowerShuffleAsLanePermuteAndPermute( | |||
| 17215 | const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, | |||
| 17216 | SelectionDAG &DAG, const X86Subtarget &Subtarget) { | |||
| 17217 | int NumElts = VT.getVectorNumElements(); | |||
| 17218 | int NumLanes = VT.getSizeInBits() / 128; | |||
| 17219 | int NumEltsPerLane = NumElts / NumLanes; | |||
| 17220 | bool CanUseSublanes = Subtarget.hasAVX2() && V2.isUndef(); | |||
| 17221 | ||||
| 17222 | /// Attempts to find a sublane permute with the given size | |||
| 17223 | /// that gets all elements into their target lanes. | |||
| 17224 | /// | |||
| 17225 | /// If successful, fills CrossLaneMask and InLaneMask and returns the lowered | |||
| 17226 | /// shuffle; otherwise returns an empty SDValue and may overwrite InLaneMask. | |||
| 17227 | auto getSublanePermute = [&](int NumSublanes) -> SDValue { | |||
| 17228 | int NumSublanesPerLane = NumSublanes / NumLanes; | |||
| 17229 | int NumEltsPerSublane = NumElts / NumSublanes; | |||
| 17230 | ||||
| 17231 | SmallVector<int, 16> CrossLaneMask; | |||
| 17232 | SmallVector<int, 16> InLaneMask(NumElts, SM_SentinelUndef); | |||
| 17233 | // CrossLaneMask but one entry == one sublane. | |||
| 17234 | SmallVector<int, 16> CrossLaneMaskLarge(NumSublanes, SM_SentinelUndef); | |||
| 17235 | ||||
| 17236 | for (int i = 0; i != NumElts; ++i) { | |||
| 17237 | int M = Mask[i]; | |||
| 17238 | if (M < 0) | |||
| 17239 | continue; | |||
| 17240 | ||||
| 17241 | int SrcSublane = M / NumEltsPerSublane; | |||
| 17242 | int DstLane = i / NumEltsPerLane; | |||
| 17243 | ||||
| 17244 | // We only need to get the elements into the right lane, not sublane. | |||
| 17245 | // So search all sublanes that make up the destination lane. | |||
| 17246 | bool Found = false; | |||
| 17247 | int DstSubStart = DstLane * NumSublanesPerLane; | |||
| 17248 | int DstSubEnd = DstSubStart + NumSublanesPerLane; | |||
| 17249 | for (int DstSublane = DstSubStart; DstSublane < DstSubEnd; ++DstSublane) { | |||
| 17250 | if (!isUndefOrEqual(CrossLaneMaskLarge[DstSublane], SrcSublane)) | |||
| 17251 | continue; | |||
| 17252 | ||||
| 17253 | Found = true; | |||
| 17254 | CrossLaneMaskLarge[DstSublane] = SrcSublane; | |||
| 17255 | int DstSublaneOffset = DstSublane * NumEltsPerSublane; | |||
| 17256 | InLaneMask[i] = DstSublaneOffset + M % NumEltsPerSublane; | |||
| 17257 | break; | |||
| 17258 | } | |||
| 17259 | if (!Found) | |||
| 17260 | return SDValue(); | |||
| 17261 | } | |||
| 17262 | ||||
| 17263 | // Fill CrossLaneMask using CrossLaneMaskLarge. | |||
| 17264 | narrowShuffleMaskElts(NumEltsPerSublane, CrossLaneMaskLarge, CrossLaneMask); | |||
| 17265 | ||||
| 17266 | if (!CanUseSublanes) { | |||
| 17267 | // If we're only shuffling a single lowest lane and the rest are identity | |||
| 17268 | // then don't bother. | |||
| 17269 | // TODO - isShuffleMaskInputInPlace could be extended to something like | |||
| 17270 | // this. | |||
| 17271 | int NumIdentityLanes = 0; | |||
| 17272 | bool OnlyShuffleLowestLane = true; | |||
| 17273 | for (int i = 0; i != NumLanes; ++i) { | |||
| 17274 | int LaneOffset = i * NumEltsPerLane; | |||
| 17275 | if (isSequentialOrUndefInRange(InLaneMask, LaneOffset, NumEltsPerLane, | |||
| 17276 | i * NumEltsPerLane)) | |||
| 17277 | NumIdentityLanes++; | |||
| 17278 | else if (CrossLaneMask[LaneOffset] != 0) | |||
| 17279 | OnlyShuffleLowestLane = false; | |||
| 17280 | } | |||
| 17281 | if (OnlyShuffleLowestLane && NumIdentityLanes == (NumLanes - 1)) | |||
| 17282 | return SDValue(); | |||
| 17283 | } | |||
| 17284 | ||||
| 17285 | // Avoid returning the same shuffle operation. For example, | |||
| 17286 | // t7: v16i16 = vector_shuffle<8,9,10,11,4,5,6,7,0,1,2,3,12,13,14,15> t5, | |||
| 17287 | // undef:v16i16 | |||
| 17288 | if (CrossLaneMask == Mask || InLaneMask == Mask) | |||
| 17289 | return SDValue(); | |||
| 17290 | ||||
| 17291 | SDValue CrossLane = DAG.getVectorShuffle(VT, DL, V1, V2, CrossLaneMask); | |||
| 17292 | return DAG.getVectorShuffle(VT, DL, CrossLane, DAG.getUNDEF(VT), | |||
| 17293 | InLaneMask); | |||
| 17294 | }; | |||
| 17295 | ||||
| 17296 | // First attempt a solution with full lanes. | |||
| 17297 | if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes)) | |||
| 17298 | return V; | |||
| 17299 | ||||
| 17300 | // The rest of the solutions use sublanes. | |||
| 17301 | if (!CanUseSublanes) | |||
| 17302 | return SDValue(); | |||
| 17303 | ||||
| 17304 | // Then attempt a solution with 64-bit sublanes (vpermq). | |||
| 17305 | if (SDValue V = getSublanePermute(/*NumSublanes=*/NumLanes * 2)) | |||
| 17306 | return V; | |||
| 17307 | ||||
| 17308 | // If that doesn't work and we have fast variable cross-lane shuffle, | |||
| 17309 | // attempt 32-bit sublanes (vpermd). | |||
| 17310 | if (!Subtarget.hasFastVariableCrossLaneShuffle()) | |||
| 17311 | return SDValue(); | |||
| 17312 | ||||
| 17313 | return getSublanePermute(/*NumSublanes=*/NumLanes * 4); | |||
| 17314 | } | |||
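| | // A hedged sketch (plain std::vector, single-input whole-lane case only) | |||
| | // of the decomposition getSublanePermute performs above: a lane-crossing | |||
| | // mask splits into a lane permute followed by an in-lane permute, failing | |||
| | // when a destination lane needs elements from two different source lanes. | |||
| | static bool splitLanePermute(const std::vector<int> &Mask, int EltsPerLane, | |||
| | std::vector<int> &CrossLane, | |||
| | std::vector<int> &InLane) { | |||
| | int NumElts = int(Mask.size()); | |||
| | std::vector<int> SrcOfDstLane(NumElts / EltsPerLane, -1); | |||
| | InLane.assign(NumElts, -1); | |||
| | for (int i = 0; i != NumElts; ++i) { | |||
| | if (Mask[i] < 0) | |||
| | continue; | |||
| | int SrcLane = Mask[i] / EltsPerLane, DstLane = i / EltsPerLane; | |||
| | if (SrcOfDstLane[DstLane] >= 0 && SrcOfDstLane[DstLane] != SrcLane) | |||
| | return false; // lane conflict: this case needs sublanes | |||
| | SrcOfDstLane[DstLane] = SrcLane; | |||
| | InLane[i] = DstLane * EltsPerLane + Mask[i] % EltsPerLane; | |||
| | } | |||
| | CrossLane.assign(NumElts, -1); | |||
| | for (int i = 0; i != NumElts; ++i) | |||
| | if (SrcOfDstLane[i / EltsPerLane] >= 0) | |||
| | CrossLane[i] = | |||
| | SrcOfDstLane[i / EltsPerLane] * EltsPerLane + i % EltsPerLane; | |||
| | return true; // e.g. <4,5,6,7,1,0,3,2> -> swap lanes, then <0,1,2,3,5,4,7,6> | |||
| | } | |||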
| 17315 | ||||
| 17316 | /// Helper to compute the in-lane shuffle mask for a complete shuffle mask. | |||
| 17317 | static void computeInLaneShuffleMask(const ArrayRef<int> &Mask, int LaneSize, | |||
| 17318 | SmallVector<int> &InLaneMask) { | |||
| 17319 | int Size = Mask.size(); | |||
| 17320 | InLaneMask.assign(Mask.begin(), Mask.end()); | |||
| 17321 | for (int i = 0; i < Size; ++i) { | |||
| 17322 | int &M = InLaneMask[i]; | |||
| 17323 | if (M < 0) | |||
| 17324 | continue; | |||
| 17325 | if (((M % Size) / LaneSize) != (i / LaneSize)) | |||
| 17326 | M = (M % LaneSize) + ((i / LaneSize) * LaneSize) + Size; | |||
| 17327 | } | |||
| 17328 | } | |||
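| | // A stand-alone restatement (assuming <vector>) of the helper above: any | |||
| | // cross-lane reference is redirected into the second operand (index plus | |||
| | // Size), which the caller materializes as a lane-flipped copy of the | |||
| | // source vector. | |||
| | static std::vector<int> inLaneMaskExample(const std::vector<int> &Mask, | |||
| | int LaneSize) { | |||
| | int Size = int(Mask.size()); | |||
| | std::vector<int> R(Mask); | |||
| | for (int i = 0; i != Size; ++i) { | |||
| | int &M = R[i]; | |||
| | if (M < 0) | |||
| | continue; | |||
| | if (((M % Size) / LaneSize) != (i / LaneSize)) // crosses lanes | |||
| | M = (M % LaneSize) + (i / LaneSize) * LaneSize + Size; | |||
| | } | |||
| | return R; // e.g. <2,1,0,3> with LaneSize 2 -> <4,1,6,3> | |||
| | } | |||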
| 17329 | ||||
| 17330 | /// Lower a vector shuffle crossing multiple 128-bit lanes by shuffling one | |||
| 17331 | /// source with a lane permutation. | |||
| 17332 | /// | |||
| 17333 | /// This lowering strategy results in four instructions in the worst case for a | |||
| 17334 | /// single-input cross lane shuffle which is lower than any other fully general | |||
| 17335 | /// cross-lane shuffle strategy I'm aware of. Special cases for each particular | |||
| 17336 | /// shuffle pattern should be handled prior to trying this lowering. | |||
| 17337 | static SDValue lowerShuffleAsLanePermuteAndShuffle( | |||
| 17338 | const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, | |||
| 17339 | SelectionDAG &DAG, const X86Subtarget &Subtarget) { | |||
| 17340 | // FIXME: This should probably be generalized for 512-bit vectors as well. | |||
| 17341 | assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); | |||
| 17342 | int Size = Mask.size(); | |||
| 17343 | int LaneSize = Size / 2; | |||
| 17344 | ||||
| 17345 | // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)). | |||
| 17346 | // Only do this if the elements aren't all from the lower lane, | |||
| 17347 | // otherwise we're (probably) better off doing a split. | |||
| 17348 | if (VT == MVT::v4f64 && | |||
| 17349 | !all_of(Mask, [LaneSize](int M) { return M < LaneSize; })) | |||
| 17350 | return lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG); | |||
| 17351 | ||||
| 17352 | // If there are only inputs from one 128-bit lane, splitting will in fact be | |||
| 17353 | // less expensive. The flags track whether the given lane contains an element | |||
| 17354 | // that crosses to another lane. | |||
| 17355 | bool AllLanes; | |||
| 17356 | if (!Subtarget.hasAVX2()) { | |||
| 17357 | bool LaneCrossing[2] = {false, false}; | |||
| 17358 | for (int i = 0; i < Size; ++i) | |||
| 17359 | if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize)) | |||
| 17360 | LaneCrossing[(Mask[i] % Size) / LaneSize] = true; | |||
| 17361 | AllLanes = LaneCrossing[0] && LaneCrossing[1]; | |||
| 17362 | } else { | |||
| 17363 | bool LaneUsed[2] = {false, false}; | |||
| 17364 | for (int i = 0; i < Size; ++i) | |||
| 17365 | if (Mask[i] >= 0) | |||
| 17366 | LaneUsed[(Mask[i] % Size) / LaneSize] = true; | |||
| 17367 | AllLanes = LaneUsed[0] && LaneUsed[1]; | |||
| 17368 | } | |||
| 17369 | ||||
| 17370 | // TODO - we could support shuffling V2 in the Flipped input. | |||
| 17371 | assert(V2.isUndef() && | |||
| 17372 | "This last part of this routine only works on single input shuffles"); | |||
| 17373 | ||||
| 17374 | SmallVector<int> InLaneMask; | |||
| 17375 | computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask); | |||
| 17376 | ||||
| 17377 | assert(!is128BitLaneCrossingShuffleMask(VT, InLaneMask) && | |||
| 17378 | "In-lane shuffle mask expected"); | |||
| 17379 | ||||
| 17380 | // If we're not using both lanes in each lane and the inlane mask is not | |||
| 17381 | // repeating, then we're better off splitting. | |||
| 17382 | if (!AllLanes && !is128BitLaneRepeatedShuffleMask(VT, InLaneMask)) | |||
| 17383 | return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, | |||
| 17384 | /*SimpleOnly*/ false); | |||
| 17385 | ||||
| 17386 | // Flip the lanes, and shuffle the results which should now be in-lane. | |||
| 17387 | MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64; | |||
| 17388 | SDValue Flipped = DAG.getBitcast(PVT, V1); | |||
| 17389 | Flipped = | |||
| 17390 | DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), {2, 3, 0, 1}); | |||
| 17391 | Flipped = DAG.getBitcast(VT, Flipped); | |||
| 17392 | return DAG.getVectorShuffle(VT, DL, V1, Flipped, InLaneMask); | |||
| 17393 | } | |||
| 17394 | ||||
| 17395 | /// Handle lowering 2-lane 128-bit shuffles. | |||
| 17396 | static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1, | |||
| 17397 | SDValue V2, ArrayRef<int> Mask, | |||
| 17398 | const APInt &Zeroable, | |||
| 17399 | const X86Subtarget &Subtarget, | |||
| 17400 | SelectionDAG &DAG) { | |||
| 17401 | if (V2.isUndef()) { | |||
| 17402 | // Attempt to match VBROADCAST*128 subvector broadcast load. | |||
| 17403 | bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1); | |||
| 17404 | bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1); | |||
| 17405 | if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() && | |||
| 17406 | X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) { | |||
| 17407 | MVT MemVT = VT.getHalfNumVectorElementsVT(); | |||
| 17408 | unsigned Ofs = SplatLo ? 0 : MemVT.getStoreSize(); | |||
| 17409 | auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1)); | |||
| 17410 | if (SDValue BcstLd = getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, DL, | |||
| 17411 | VT, MemVT, Ld, Ofs, DAG)) | |||
| 17412 | return BcstLd; | |||
| 17413 | } | |||
| 17414 | ||||
| 17415 | // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding. | |||
| 17416 | if (Subtarget.hasAVX2()) | |||
| 17417 | return SDValue(); | |||
| 17418 | } | |||
| 17419 | ||||
| 17420 | bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode()); | |||
| 17421 | ||||
| 17422 | SmallVector<int, 4> WidenedMask; | |||
| 17423 | if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask)) | |||
| 17424 | return SDValue(); | |||
| 17425 | ||||
| 17426 | bool IsLowZero = (Zeroable & 0x3) == 0x3; | |||
| 17427 | bool IsHighZero = (Zeroable & 0xc) == 0xc; | |||
| 17428 | ||||
| 17429 | // Try to use an insert into a zero vector. | |||
| 17430 | if (WidenedMask[0] == 0 && IsHighZero) { | |||
| 17431 | MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); | |||
| 17432 | SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, | |||
| 17433 | DAG.getIntPtrConstant(0, DL)); | |||
| 17434 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, | |||
| 17435 | getZeroVector(VT, Subtarget, DAG, DL), LoV, | |||
| 17436 | DAG.getIntPtrConstant(0, DL)); | |||
| 17437 | } | |||
| 17438 | ||||
| 17439 | // TODO: If minimizing size and one of the inputs is a zero vector and the | |||
| 17440 | // zero vector has only one use, we could use a VPERM2X128 to save the | |||
| 17441 | // instruction bytes needed to explicitly generate the zero vector. | |||
| 17442 | ||||
| 17443 | // Blends are faster and handle all the non-lane-crossing cases. | |||
| 17444 | if (SDValue Blend = lowerShuffleAsBlend(DL, VT, V1, V2, Mask, Zeroable, | |||
| 17445 | Subtarget, DAG)) | |||
| 17446 | return Blend; | |||
| 17447 | ||||
| 17448 | // If either input operand is a zero vector, use VPERM2X128 because its mask | |||
| 17449 | // allows us to replace the zero input with an implicit zero. | |||
| 17450 | if (!IsLowZero && !IsHighZero) { | |||
| 17451 | // Check for patterns which can be matched with a single insert of a 128-bit | |||
| 17452 | // subvector. | |||
| 17453 | bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1, V2); | |||
| 17454 | if (OnlyUsesV1 || isShuffleEquivalent(Mask, {0, 1, 4, 5}, V1, V2)) { | |||
| 17455 | ||||
| 17456 | // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise, | |||
| 17457 | // this will likely become vinsertf128 which can't fold a 256-bit memop. | |||
| 17458 | if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) { | |||
| 17459 | MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); | |||
| 17460 | SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, | |||
| 17461 | OnlyUsesV1 ? V1 : V2, | |||
| 17462 | DAG.getIntPtrConstant(0, DL)); | |||
| 17463 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec, | |||
| 17464 | DAG.getIntPtrConstant(2, DL)); | |||
| 17465 | } | |||
| 17466 | } | |||
| 17467 | ||||
| 17468 | // Try to use SHUF128 if possible. | |||
| 17469 | if (Subtarget.hasVLX()) { | |||
| 17470 | if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) { | |||
| 17471 | unsigned PermMask = ((WidenedMask[0] % 2) << 0) | | |||
| 17472 | ((WidenedMask[1] % 2) << 1); | |||
| 17473 | return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, | |||
| 17474 | DAG.getTargetConstant(PermMask, DL, MVT::i8)); | |||
| 17475 | } | |||
| 17476 | } | |||
| 17477 | } | |||
| 17478 | ||||
| 17479 | // Otherwise form a 128-bit permutation. After accounting for undefs, | |||
| 17480 | // convert the 64-bit shuffle mask selection values into 128-bit | |||
| 17481 | // selection bits by dividing the indexes by 2 and shifting into positions | |||
| 17482 | // defined by a vperm2*128 instruction's immediate control byte. | |||
| 17483 | ||||
| 17484 | // The immediate permute control byte looks like this: | |||
| 17485 | // [1:0] - select 128 bits from sources for low half of destination | |||
| 17486 | // [2] - ignore | |||
| 17487 | // [3] - zero low half of destination | |||
| 17488 | // [5:4] - select 128 bits from sources for high half of destination | |||
| 17489 | // [6] - ignore | |||
| 17490 | // [7] - zero high half of destination | |||
| 17491 | ||||
| 17492 | assert((WidenedMask[0] >= 0 || IsLowZero) && | |||
| 17493 | (WidenedMask[1] >= 0 || IsHighZero) && "Undef half?"); | |||
| 17494 | ||||
| 17495 | unsigned PermMask = 0; | |||
| 17496 | PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0); | |||
| 17497 | PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4); | |||
| 17498 | ||||
| 17499 | // Check the immediate mask and replace unused sources with undef. | |||
| 17500 | if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00) | |||
| 17501 | V1 = DAG.getUNDEF(VT); | |||
| 17502 | if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20) | |||
| 17503 | V2 = DAG.getUNDEF(VT); | |||
| 17504 | ||||
| 17505 | return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, | |||
| 17506 | DAG.getTargetConstant(PermMask, DL, MVT::i8)); | |||
| 17507 | } | |||
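| | // An illustrative re-encoding (mirroring the code above) of the | |||
| | // vperm2x128 immediate from a widened two-element mask, where values 0-3 | |||
| | // select the 128-bit halves V1.lo, V1.hi, V2.lo, V2.hi and a zeroable | |||
| | // half uses the zeroing bit instead. | |||
| | static unsigned vperm2x128Imm(int WidenedLo, int WidenedHi, bool LowZero, | |||
| | bool HighZero) { | |||
| | unsigned Imm = 0; | |||
| | Imm |= LowZero ? 0x08 : unsigned(WidenedLo); // bits [1:0], or zero bit 3 | |||
| | Imm |= HighZero ? 0x80 : unsigned(WidenedHi) << 4; // bits [5:4], or bit 7 | |||
| | return Imm; // e.g. lo = V1.lo (0), hi = V2.lo (2) gives 0x20 | |||
| | } | |||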
| 17508 | ||||
| 17509 | /// Lower a vector shuffle by first fixing the 128-bit lanes and then | |||
| 17510 | /// shuffling each lane. | |||
| 17511 | /// | |||
| 17512 | /// This attempts to create a repeated lane shuffle where each lane uses one | |||
| 17513 | /// or two of the lanes of the inputs. The lanes of the input vectors are | |||
| 17514 | /// shuffled in one or two independent shuffles to get the lanes into the | |||
| 17515 | /// position needed by the final shuffle. | |||
| 17516 | static SDValue lowerShuffleAsLanePermuteAndRepeatedMask( | |||
| 17517 | const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, | |||
| 17518 | const X86Subtarget &Subtarget, SelectionDAG &DAG) { | |||
| 17519 | assert(!V2.isUndef() && "This is only useful with multiple inputs."); | |||
| 17520 | ||||
| 17521 | if (is128BitLaneRepeatedShuffleMask(VT, Mask)) | |||
| 17522 | return SDValue(); | |||
| 17523 | ||||
| 17524 | int NumElts = Mask.size(); | |||
| 17525 | int NumLanes = VT.getSizeInBits() / 128; | |||
| 17526 | int NumLaneElts = 128 / VT.getScalarSizeInBits(); | |||
| 17527 | SmallVector<int, 16> RepeatMask(NumLaneElts, -1); | |||
| 17528 | SmallVector<std::array<int, 2>, 2> LaneSrcs(NumLanes, {{-1, -1}}); | |||
| 17529 | ||||
| 17530 | // First pass will try to fill in the RepeatMask from lanes that need two | |||
| 17531 | // sources. | |||
| 17532 | for (int Lane = 0; Lane != NumLanes; ++Lane) { | |||
| 17533 | int Srcs[2] = {-1, -1}; | |||
| 17534 | SmallVector<int, 16> InLaneMask(NumLaneElts, -1); | |||
| 17535 | for (int i = 0; i != NumLaneElts; ++i) { | |||
| 17536 | int M = Mask[(Lane * NumLaneElts) + i]; | |||
| 17537 | if (M < 0) | |||
| 17538 | continue; | |||
| 17539 | // Determine which of the possible input lanes (NumLanes from each source) | |||
| 17540 | // this element comes from. Assign that as one of the sources for this | |||
| 17541 | // lane. We can assign up to 2 sources for this lane. If we run out of | |||
| 17542 | // sources we can't do anything. | |||
| 17543 | int LaneSrc = M / NumLaneElts; | |||
| 17544 | int Src; | |||
| 17545 | if (Srcs[0] < 0 || Srcs[0] == LaneSrc) | |||
| 17546 | Src = 0; | |||
| 17547 | else if (Srcs[1] < 0 || Srcs[1] == LaneSrc) | |||
| 17548 | Src = 1; | |||
| 17549 | else | |||
| 17550 | return SDValue(); | |||
| 17551 | ||||
| 17552 | Srcs[Src] = LaneSrc; | |||
| 17553 | InLaneMask[i] = (M % NumLaneElts) + Src * NumElts; | |||
| 17554 | } | |||
| 17555 | ||||
| 17556 | // If this lane has two sources, see if it fits with the repeat mask so far. | |||
| 17557 | if (Srcs[1] < 0) | |||
| 17558 | continue; | |||
| 17559 | ||||
| 17560 | LaneSrcs[Lane][0] = Srcs[0]; | |||
| 17561 | LaneSrcs[Lane][1] = Srcs[1]; | |||
| 17562 | ||||
| 17563 | auto MatchMasks = [](ArrayRef<int> M1, ArrayRef<int> M2) { | |||
| 17564 | assert(M1.size() == M2.size() && "Unexpected mask size"); | |||
| 17565 | for (int i = 0, e = M1.size(); i != e; ++i) | |||
| 17566 | if (M1[i] >= 0 && M2[i] >= 0 && M1[i] != M2[i]) | |||
| 17567 | return false; | |||
| 17568 | return true; | |||
| 17569 | }; | |||
| 17570 | ||||
| 17571 | auto MergeMasks = [](ArrayRef<int> Mask, MutableArrayRef<int> MergedMask) { | |||
| 17572 | assert(Mask.size() == MergedMask.size() && "Unexpected mask size"); | |||
| 17573 | for (int i = 0, e = MergedMask.size(); i != e; ++i) { | |||
| 17574 | int M = Mask[i]; | |||
| 17575 | if (M < 0) | |||
| 17576 | continue; | |||
| 17577 | assert((MergedMask[i] < 0 || MergedMask[i] == M) && | |||
| 17578 | "Unexpected mask element"); | |||
| 17579 | MergedMask[i] = M; | |||
| 17580 | } | |||
| 17581 | }; | |||
| 17582 | ||||
| 17583 | if (MatchMasks(InLaneMask, RepeatMask)) { | |||
| 17584 | // Merge this lane mask into the final repeat mask. | |||
| 17585 | MergeMasks(InLaneMask, RepeatMask); | |||
| 17586 | continue; | |||
| 17587 | } | |||
| 17588 | ||||
| 17589 | // Didn't find a match. Swap the operands and try again. | |||
| 17590 | std::swap(LaneSrcs[Lane][0], LaneSrcs[Lane][1]); | |||
| 17591 | ShuffleVectorSDNode::commuteMask(InLaneMask); | |||
| 17592 | ||||
| 17593 | if (MatchMasks(InLaneMask, RepeatMask)) { | |||
| 17594 | // Merge this lane mask into the final repeat mask. | |||
| 17595 | MergeMasks(InLaneMask, RepeatMask); | |||
| 17596 | continue; | |||
| 17597 | } | |||
| 17598 | ||||
| 17599 | // Couldn't find a match with the operands in either order. | |||
| 17600 | return SDValue(); | |||
| 17601 | } | |||
| 17602 | ||||
| 17603 | // Now handle any lanes with only one source. | |||
| 17604 | for (int Lane = 0; Lane != NumLanes; ++Lane) { | |||
| 17605 | // If this lane has already been processed, skip it. | |||
| 17606 | if (LaneSrcs[Lane][0] >= 0) | |||
| 17607 | continue; | |||
| 17608 | ||||
| 17609 | for (int i = 0; i != NumLaneElts; ++i) { | |||
| 17610 | int M = Mask[(Lane * NumLaneElts) + i]; | |||
| 17611 | if (M < 0) | |||
| 17612 | continue; | |||
| 17613 | ||||
| 17614 | // If RepeatMask isn't defined yet we can define it ourselves. | |||
| 17615 | if (RepeatMask[i] < 0) | |||
| 17616 | RepeatMask[i] = M % NumLaneElts; | |||
| 17617 | ||||
| 17618 | if (RepeatMask[i] < NumElts) { | |||
| 17619 | if (RepeatMask[i] != M % NumLaneElts) | |||
| 17620 | return SDValue(); | |||
| 17621 | LaneSrcs[Lane][0] = M / NumLaneElts; | |||
| 17622 | } else { | |||
| 17623 | if (RepeatMask[i] != ((M % NumLaneElts) + NumElts)) | |||
| 17624 | return SDValue(); | |||
| 17625 | LaneSrcs[Lane][1] = M / NumLaneElts; | |||
| 17626 | } | |||
| 17627 | } | |||
| 17628 | ||||
| 17629 | if (LaneSrcs[Lane][0] < 0 && LaneSrcs[Lane][1] < 0) | |||
| 17630 | return SDValue(); | |||
| 17631 | } | |||
| 17632 | ||||
| 17633 | SmallVector<int, 16> NewMask(NumElts, -1); | |||
| 17634 | for (int Lane = 0; Lane != NumLanes; ++Lane) { | |||
| 17635 | int Src = LaneSrcs[Lane][0]; | |||
| 17636 | for (int i = 0; i != NumLaneElts; ++i) { | |||
| 17637 | int M = -1; | |||
| 17638 | if (Src >= 0) | |||
| 17639 | M = Src * NumLaneElts + i; | |||
| 17640 | NewMask[Lane * NumLaneElts + i] = M; | |||
| 17641 | } | |||
| 17642 | } | |||
| 17643 | SDValue NewV1 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); | |||
| 17644 | // Ensure we didn't get back the shuffle we started with. | |||
| 17645 | // FIXME: This is a hack to make up for some splat handling code in | |||
| 17646 | // getVectorShuffle. | |||
| 17647 | if (isa<ShuffleVectorSDNode>(NewV1) && | |||
| 17648 | cast<ShuffleVectorSDNode>(NewV1)->getMask() == Mask) | |||
| 17649 | return SDValue(); | |||
| 17650 | ||||
| 17651 | for (int Lane = 0; Lane != NumLanes; ++Lane) { | |||
| 17652 | int Src = LaneSrcs[Lane][1]; | |||
| 17653 | for (int i = 0; i != NumLaneElts; ++i) { | |||
| 17654 | int M = -1; | |||
| 17655 | if (Src >= 0) | |||
| 17656 | M = Src * NumLaneElts + i; | |||
| 17657 | NewMask[Lane * NumLaneElts + i] = M; | |||
| 17658 | } | |||
| 17659 | } | |||
| 17660 | SDValue NewV2 = DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); | |||
| 17661 | // Ensure we didn't get back the shuffle we started with. | |||
| 17662 | // FIXME: This is a hack to make up for some splat handling code in | |||
| 17663 | // getVectorShuffle. | |||
| 17664 | if (isa<ShuffleVectorSDNode>(NewV2) && | |||
| 17665 | cast<ShuffleVectorSDNode>(NewV2)->getMask() == Mask) | |||
| 17666 | return SDValue(); | |||
| 17667 | ||||
| 17668 | for (int i = 0; i != NumElts; ++i) { | |||
| 17669 | if (Mask[i] < 0) { | |||
| 17670 | NewMask[i] = -1; | |||
| 17671 | continue; | |||
| 17672 | } | |||
| 17673 | NewMask[i] = RepeatMask[i % NumLaneElts]; | |||
| 17674 | if (NewMask[i] < 0) | |||
| 17675 | continue; | |||
| 17676 | ||||
| 17677 | NewMask[i] += (i / NumLaneElts) * NumLaneElts; | |||
| 17678 | } | |||
| 17679 | return DAG.getVectorShuffle(VT, DL, NewV1, NewV2, NewMask); | |||
| 17680 | } | |||
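| | // A worked example of the routine above (illustrative trace): for v8f32 | |||
| | // Mask = <0,12,1,13, 4,8,5,9>, lane 0 draws from V1.lane0 and V2.lane1 | |||
| | // while lane 1 draws from V1.lane1 and V2.lane0, and both lanes share the | |||
| | // repeat mask <0,8,1,9>. The lowering keeps NewV1 = V1 (identity lane | |||
| | // sources), builds NewV2 = V2 with its 128-bit lanes swapped, and emits | |||
| | // the repeating shuffle <0,8,1,9, 4,12,5,13> (an UNPCKLPS pattern). | |||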
| 17681 | ||||
| 17682 | /// If the input shuffle mask results in a vector that is undefined in all upper | |||
| 17683 | /// or lower half elements and that mask accesses only 2 halves of the | |||
| 17684 | /// shuffle's operands, return true. A mask of half the width with mask indexes | |||
| 17685 | /// adjusted to access the extracted halves of the original shuffle operands is | |||
| 17686 | /// returned in HalfMask. HalfIdx1 and HalfIdx2 return whether the upper or | |||
| 17687 | /// lower half of each input operand is accessed. | |||
| 17688 | static bool | |||
| 17689 | getHalfShuffleMask(ArrayRef<int> Mask, MutableArrayRef<int> HalfMask, | |||
| 17690 | int &HalfIdx1, int &HalfIdx2) { | |||
| 17691 | assert((Mask.size() == HalfMask.size() * 2) && | |||
| 17692 | "Expected input mask to be twice as long as output"); | |||
| 17693 | ||||
| 17694 | // Exactly one half of the result must be undef to allow narrowing. | |||
| 17695 | bool UndefLower = isUndefLowerHalf(Mask); | |||
| 17696 | bool UndefUpper = isUndefUpperHalf(Mask); | |||
| 17697 | if (UndefLower == UndefUpper) | |||
| 17698 | return false; | |||
| 17699 | ||||
| 17700 | unsigned HalfNumElts = HalfMask.size(); | |||
| 17701 | unsigned MaskIndexOffset = UndefLower ? HalfNumElts : 0; | |||
| 17702 | HalfIdx1 = -1; | |||
| 17703 | HalfIdx2 = -1; | |||
| 17704 | for (unsigned i = 0; i != HalfNumElts; ++i) { | |||
| 17705 | int M = Mask[i + MaskIndexOffset]; | |||
| 17706 | if (M < 0) { | |||
| 17707 | HalfMask[i] = M; | |||
| 17708 | continue; | |||
| 17709 | } | |||
| 17710 | ||||
| 17711 | // Determine which of the 4 half vectors this element is from. | |||
| 17712 | // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2. | |||
| 17713 | int HalfIdx = M / HalfNumElts; | |||
| 17714 | ||||
| 17715 | // Determine the element index into its half vector source. | |||
| 17716 | int HalfElt = M % HalfNumElts; | |||
| 17717 | ||||
| 17718 | // We can shuffle with up to 2 half vectors, set the new 'half' | |||
| 17719 | // shuffle mask accordingly. | |||
| 17720 | if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) { | |||
| 17721 | HalfMask[i] = HalfElt; | |||
| 17722 | HalfIdx1 = HalfIdx; | |||
| 17723 | continue; | |||
| 17724 | } | |||
| 17725 | if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) { | |||
| 17726 | HalfMask[i] = HalfElt + HalfNumElts; | |||
| 17727 | HalfIdx2 = HalfIdx; | |||
| 17728 | continue; | |||
| 17729 | } | |||
| 17730 | ||||
| 17731 | // Too many half vectors referenced. | |||
| 17732 | return false; | |||
| 17733 | } | |||
| 17734 | ||||
| 17735 | return true; | |||
| 17736 | } | |||
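| | // A worked example of getHalfShuffleMask (illustrative trace): the v8 | |||
| | // mask <u,u,u,u, 0,1,8,9> has an undef lower half, so only the upper four | |||
| | // entries are mapped: 0,1 hit the lower half of V1 (HalfIdx1 = 0, | |||
| | // HalfMask[0..1] = 0,1) and 8,9 hit the lower half of V2 (HalfIdx2 = 2, | |||
| | // HalfMask[2..3] = 4,5), yielding the v4 half shuffle <0,1,4,5>. | |||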
| 17737 | ||||
| 17738 | /// Given the output values from getHalfShuffleMask(), create a half width | |||
| 17739 | /// shuffle of extracted vectors followed by an insert back to full width. | |||
| 17740 | static SDValue getShuffleHalfVectors(const SDLoc &DL, SDValue V1, SDValue V2, | |||
| 17741 | ArrayRef<int> HalfMask, int HalfIdx1, | |||
| 17742 | int HalfIdx2, bool UndefLower, | |||
| 17743 | SelectionDAG &DAG, bool UseConcat = false) { | |||
| 17744 | assert(V1.getValueType() == V2.getValueType() && "Different sized vectors?"); | |||
| 17745 | assert(V1.getValueType().isSimple() && "Expecting only simple types"); | |||
| 17746 | ||||
| 17747 | MVT VT = V1.getSimpleValueType(); | |||
| 17748 | MVT HalfVT = VT.getHalfNumVectorElementsVT(); | |||
| 17749 | unsigned HalfNumElts = HalfVT.getVectorNumElements(); | |||
| 17750 | ||||
| 17751 | auto getHalfVector = [&](int HalfIdx) { | |||
| 17752 | if (HalfIdx < 0) | |||
| 17753 | return DAG.getUNDEF(HalfVT); | |||
| 17754 | SDValue V = (HalfIdx < 2 ? V1 : V2); | |||
| 17755 | HalfIdx = (HalfIdx % 2) * HalfNumElts; | |||
| 17756 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V, | |||
| 17757 | DAG.getIntPtrConstant(HalfIdx, DL)); | |||
| 17758 | }; | |||
| 17759 | ||||
| 17760 | // ins undef, (shuf (ext V1, HalfIdx1), (ext V2, HalfIdx2), HalfMask), Offset | |||
| 17761 | SDValue Half1 = getHalfVector(HalfIdx1); | |||
| 17762 | SDValue Half2 = getHalfVector(HalfIdx2); | |||
| 17763 | SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask); | |||
| 17764 | if (UseConcat) { | |||
| 17765 | SDValue Op0 = V; | |||
| 17766 | SDValue Op1 = DAG.getUNDEF(HalfVT); | |||
| 17767 | if (UndefLower) | |||
| 17768 | std::swap(Op0, Op1); | |||
| 17769 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Op0, Op1); | |||
| 17770 | } | |||
| 17771 | ||||
| 17772 | unsigned Offset = UndefLower ? HalfNumElts : 0; | |||
| 17773 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, | |||
| 17774 | DAG.getIntPtrConstant(Offset, DL)); | |||
| 17775 | } | |||
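| | // The recombination above in pseudo-steps (hedged outline of the code, | |||
| | // not a new API), with N the full vector element count: | |||
| | // Half1 = extract_subvector(HalfIdx1 < 2 ? V1 : V2, (HalfIdx1 % 2) * N/2) | |||
| | // Half2 = extract_subvector(HalfIdx2 < 2 ? V1 : V2, (HalfIdx2 % 2) * N/2) | |||
| | // V = shuffle(Half1, Half2, HalfMask) | |||
| | // result = insert_subvector(undef, V, UndefLower ? N/2 : 0) | |||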
| 17776 | ||||
| 17777 | /// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF. | |||
| 17778 | /// This allows for fast cases such as subvector extraction/insertion | |||
| 17779 | /// or shuffling smaller vector types which can lower more efficiently. | |||
| 17780 | static SDValue lowerShuffleWithUndefHalf(const SDLoc &DL, MVT VT, SDValue V1, | |||
| 17781 | SDValue V2, ArrayRef<int> Mask, | |||
| 17782 | const X86Subtarget &Subtarget, | |||
| 17783 | SelectionDAG &DAG) { | |||
| 17784 | assert((VT.is256BitVector() || VT.is512BitVector()) && | |||
| 17785 | "Expected 256-bit or 512-bit vector"); | |||
| 17786 | ||||
| 17787 | bool UndefLower = isUndefLowerHalf(Mask); | |||
| 17788 | if (!UndefLower && !isUndefUpperHalf(Mask)) | |||
| 17789 | return SDValue(); | |||
| 17790 | ||||
| 17791 | assert((!UndefLower || !isUndefUpperHalf(Mask)) && | |||
| 17792 | "Completely undef shuffle mask should have been simplified already"); | |||
| 17793 | ||||
| 17794 | // Upper half is undef and lower half is whole upper subvector. | |||
| 17795 | // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u> | |||
| 17796 | MVT HalfVT = VT.getHalfNumVectorElementsVT(); | |||
| 17797 | unsigned HalfNumElts = HalfVT.getVectorNumElements(); | |||
| 17798 | if (!UndefLower && | |||
| 17799 | isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) { | |||
| 17800 | SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, | |||
| 17801 | DAG.getIntPtrConstant(HalfNumElts, DL)); | |||
| 17802 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, | |||
| 17803 | DAG.getIntPtrConstant(0, DL)); | |||
| 17804 | } | |||
| 17805 | ||||
| 17806 | // Lower half is undef and upper half is whole lower subvector. | |||
| 17807 | // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1> | |||
| 17808 | if (UndefLower && | |||
| 17809 | isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) { | |||
| 17810 | SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1, | |||
| 17811 | DAG.getIntPtrConstant(0, DL)); | |||
| 17812 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi, | |||
| 17813 | DAG.getIntPtrConstant(HalfNumElts, DL)); | |||
| 17814 | } | |||
| 17815 | ||||
| 17816 | int HalfIdx1, HalfIdx2; | |||
| 17817 | SmallVector<int, 8> HalfMask(HalfNumElts); | |||
| 17818 | if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2)) | |||
| 17819 | return SDValue(); | |||
| 17820 | ||||
| 17821 | assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length"); | |||
| 17822 | ||||
| 17823 | // Only shuffle the halves of the inputs when useful. | |||
| 17824 | unsigned NumLowerHalves = | |||
| 17825 | (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2); | |||
| 17826 | unsigned NumUpperHalves = | |||
| 17827 | (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3); | |||
| 17828 | assert(NumLowerHalves + NumUpperHalves <= 2 && "Only 1 or 2 halves allowed"); | |||
| 17829 | ||||
| 17830 | // Determine the larger pattern of undef/halves, then decide if it's worth | |||
| 17831 | // splitting the shuffle based on subtarget capabilities and types. | |||
| 17832 | unsigned EltWidth = VT.getVectorElementType().getSizeInBits(); | |||
| 17833 | if (!UndefLower) { | |||
| 17834 | // XXXXuuuu: no insert is needed. | |||
| 17835 | // Always extract lowers when setting lower - these are all free subreg ops. | |||
| 17836 | if (NumUpperHalves == 0) | |||
| 17837 | return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, | |||
| 17838 | UndefLower, DAG); | |||
| 17839 | ||||
| 17840 | if (NumUpperHalves == 1) { | |||
| 17841 | // AVX2 has efficient 32/64-bit element cross-lane shuffles. | |||
| 17842 | if (Subtarget.hasAVX2()) { | |||
| 17843 | // extract128 + vunpckhps/vshufps is better than vblend + vpermps. | |||
| 17844 | if (EltWidth == 32 && NumLowerHalves && HalfVT.is128BitVector() && | |||
| 17845 | !is128BitUnpackShuffleMask(HalfMask, DAG) && | |||
| 17846 | (!isSingleSHUFPSMask(HalfMask) || | |||
| 17847 | Subtarget.hasFastVariableCrossLaneShuffle())) | |||
| 17848 | return SDValue(); | |||
| 17849 | // If this is a unary shuffle (assume that the 2nd operand is | |||
| 17850 | // canonicalized to undef), then we can use vpermpd. Otherwise, we | |||
| 17851 | // are better off extracting the upper half of 1 operand and using a | |||
| 17852 | // narrow shuffle. | |||
| 17853 | if (EltWidth == 64 && V2.isUndef()) | |||
| 17854 | return SDValue(); | |||
| 17855 | } | |||
| 17856 | // AVX512 has efficient cross-lane shuffles for all legal 512-bit types. | |||
| 17857 | if (Subtarget.hasAVX512() && VT.is512BitVector()) | |||
| 17858 | return SDValue(); | |||
| 17859 | // Extract + narrow shuffle is better than the wide alternative. | |||
| 17860 | return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, | |||
| 17861 | UndefLower, DAG); | |||
| 17862 | } | |||
| 17863 | ||||
| 17864 | // Don't extract both uppers, instead shuffle and then extract. | |||
| 17865 | assert(NumUpperHalves == 2 && "Half vector count went wrong"); | |||
| 17866 | return SDValue(); | |||
| 17867 | } | |||
| 17868 | ||||
| 17869 | // UndefLower - uuuuXXXX: an insert to high half is required if we split this. | |||
| 17870 | if (NumUpperHalves == 0) { | |||
| 17871 | // AVX2 has efficient 64-bit element cross-lane shuffles. | |||
| 17872 | // TODO: Refine to account for unary shuffle, splat, and other masks? | |||
| 17873 | if (Subtarget.hasAVX2() && EltWidth == 64) | |||
| 17874 | return SDValue(); | |||
| 17875 | // AVX512 has efficient cross-lane shuffles for all legal 512-bit types. | |||
| 17876 | if (Subtarget.hasAVX512() && VT.is512BitVector()) | |||
| 17877 | return SDValue(); | |||
| 17878 | // Narrow shuffle + insert is better than the wide alternative. | |||
| 17879 | return getShuffleHalfVectors(DL, V1, V2, HalfMask, HalfIdx1, HalfIdx2, | |||
| 17880 | UndefLower, DAG); | |||
| 17881 | } | |||
| 17882 | ||||
| 17883 | // NumUpperHalves != 0: don't bother with extract, shuffle, and then insert. | |||
| 17884 | return SDValue(); | |||
| 17885 | } | |||
| 17886 | ||||
| 17887 | /// Handle case where shuffle sources are coming from the same 128-bit lane and | |||
| 17888 | /// every lane can be represented as the same repeating mask - allowing us to | |||
| 17889 | /// shuffle the sources with the repeating shuffle and then permute the result | |||
| 17890 | /// to the destination lanes. | |||
| 17891 | static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( | |||
| 17892 | const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, | |||
| 17893 | const X86Subtarget &Subtarget, SelectionDAG &DAG) { | |||
| 17894 | int NumElts = VT.getVectorNumElements(); | |||
| 17895 | int NumLanes = VT.getSizeInBits() / 128; | |||
| 17896 | int NumLaneElts = NumElts / NumLanes; | |||
| 17897 | ||||
| 17898 | // On AVX2 we may be able to just shuffle the lowest elements and then | |||
| 17899 | // broadcast the result. | |||
| 17900 | if (Subtarget.hasAVX2()) { | |||
| 17901 | for (unsigned BroadcastSize : {16, 32, 64}) { | |||
| 17902 | if (BroadcastSize <= VT.getScalarSizeInBits()) | |||
| 17903 | continue; | |||
| 17904 | int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits(); | |||
| 17905 | ||||
| 17906 | // Attempt to match a repeating pattern every NumBroadcastElts that | |||
| 17907 | // accounts for UNDEFs but only references the lowest 128-bit lane of | |||
| 17908 | // the inputs. | |||
| 17909 | auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) { | |||
| 17910 | for (int i = 0; i != NumElts; i += NumBroadcastElts) | |||
| 17911 | for (int j = 0; j != NumBroadcastElts; ++j) { | |||
| 17912 | int M = Mask[i + j]; | |||
| 17913 | if (M < 0) | |||
| 17914 | continue; | |||
| 17915 | int &R = RepeatMask[j]; | |||
| 17916 | if (0 != ((M % NumElts) / NumLaneElts)) | |||
| 17917 | return false; | |||
| 17918 | if (0 <= R && R != M) | |||
| 17919 | return false; | |||
| 17920 | R = M; | |||
| 17921 | } | |||
| 17922 | return true; | |||
| 17923 | }; | |||
| 17924 | ||||
| 17925 | SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1); | |||
| 17926 | if (!FindRepeatingBroadcastMask(RepeatMask)) | |||
| 17927 | continue; | |||
| 17928 | ||||
| 17929 | // Shuffle the (lowest) repeated elements in place for broadcast. | |||
| 17930 | SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask); | |||
| 17931 | ||||
| 17932 | // Shuffle the actual broadcast. | |||
| 17933 | SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1); | |||
| 17934 | for (int i = 0; i != NumElts; i += NumBroadcastElts) | |||
| 17935 | for (int j = 0; j != NumBroadcastElts; ++j) | |||
| 17936 | BroadcastMask[i + j] = j; | |||
| 17937 | return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT), | |||
| 17938 | BroadcastMask); | |||
| 17939 | } | |||
| 17940 | } | |||
| 17941 | ||||
| 17942 | // Bail if the shuffle mask doesn't cross 128-bit lanes. | |||
| 17943 | if (!is128BitLaneCrossingShuffleMask(VT, Mask)) | |||
| 17944 | return SDValue(); | |||
| 17945 | ||||
| 17946 | // Bail if we already have a repeated lane shuffle mask. | |||
| 17947 | if (is128BitLaneRepeatedShuffleMask(VT, Mask)) | |||
| 17948 | return SDValue(); | |||
| 17949 | ||||
| 17950 | // Helper to look for a repeated mask in each split sub-lane, and to check | |||
| 17951 | // that those sub-lanes can then be permuted into place. | |||
| 17952 | auto ShuffleSubLanes = [&](int SubLaneScale) { | |||
| 17953 | int NumSubLanes = NumLanes * SubLaneScale; | |||
| 17954 | int NumSubLaneElts = NumLaneElts / SubLaneScale; | |||
| 17955 | ||||
| 17956 | // Check that all the sources are coming from the same lane and see if we | |||
| 17957 | // can form a repeating shuffle mask (local to each sub-lane). At the same | |||
| 17958 | // time, determine the source sub-lane for each destination sub-lane. | |||
| 17959 | int TopSrcSubLane = -1; | |||
| 17960 | SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1); | |||
| 17961 | SmallVector<SmallVector<int, 8>> RepeatedSubLaneMasks( | |||
| 17962 | SubLaneScale, | |||
| 17963 | SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)); | |||
| 17964 | ||||
| 17965 | for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) { | |||
| 17966 | // Extract the sub-lane mask, check that it all comes from the same lane | |||
| 17967 | // and normalize the mask entries to come from the first lane. | |||
| 17968 | int SrcLane = -1; | |||
| 17969 | SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1); | |||
| 17970 | for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { | |||
| 17971 | int M = Mask[(DstSubLane * NumSubLaneElts) + Elt]; | |||
| 17972 | if (M < 0) | |||
| 17973 | continue; | |||
| 17974 | int Lane = (M % NumElts) / NumLaneElts; | |||
| 17975 | if ((0 <= SrcLane) && (SrcLane != Lane)) | |||
| 17976 | return SDValue(); | |||
| 17977 | SrcLane = Lane; | |||
| 17978 | int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts); | |||
| 17979 | SubLaneMask[Elt] = LocalM; | |||
| 17980 | } | |||
| 17981 | ||||
| 17982 | // Whole sub-lane is UNDEF. | |||
| 17983 | if (SrcLane < 0) | |||
| 17984 | continue; | |||
| 17985 | ||||
| 17986 | // Attempt to match against the candidate repeated sub-lane masks. | |||
| 17987 | for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) { | |||
| 17988 | auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) { | |||
| 17989 | for (int i = 0; i != NumSubLaneElts; ++i) { | |||
| 17990 | if (M1[i] < 0 || M2[i] < 0) | |||
| 17991 | continue; | |||
| 17992 | if (M1[i] != M2[i]) | |||
| 17993 | return false; | |||
| 17994 | } | |||
| 17995 | return true; | |||
| 17996 | }; | |||
| 17997 | ||||
| 17998 | auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane]; | |||
| 17999 | if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask)) | |||
| 18000 | continue; | |||
| 18001 | ||||
| 18002 | // Merge the sub-lane mask into the matching repeated sub-lane mask. | |||
| 18003 | for (int i = 0; i != NumSubLaneElts; ++i) { | |||
| 18004 | int M = SubLaneMask[i]; | |||
| 18005 | if (M < 0) | |||
| 18006 | continue; | |||
| 18007 | assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) && | |||
| 18008 | "Unexpected mask element"); | |||
| 18009 | RepeatedSubLaneMask[i] = M; | |||
| 18010 | } | |||
| 18011 | ||||
| 18012 | // Track the topmost source sub-lane - by setting the remaining to | |||
| 18013 | // UNDEF we can greatly simplify shuffle matching. | |||
| 18014 | int SrcSubLane = (SrcLane * SubLaneScale) + SubLane; | |||
| 18015 | TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane); | |||
| 18016 | Dst2SrcSubLanes[DstSubLane] = SrcSubLane; | |||
| 18017 | break; | |||
| 18018 | } | |||
| 18019 | ||||
| 18020 | // Bail if we failed to find a matching repeated sub-lane mask. | |||
| 18021 | if (Dst2SrcSubLanes[DstSubLane] < 0) | |||
| 18022 | return SDValue(); | |||
| 18023 | } | |||
| 18024 | assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes && | |||
| 18025 | "Unexpected source lane"); | |||
| 18026 | ||||
| 18027 | // Create a repeating shuffle mask for the entire vector. | |||
| 18028 | SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1); | |||
| 18029 | for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) { | |||
| 18030 | int Lane = SubLane / SubLaneScale; | |||
| 18031 | auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale]; | |||
| 18032 | for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) { | |||
| 18033 | int M = RepeatedSubLaneMask[Elt]; | |||
| 18034 | if (M < 0) | |||
| 18035 | continue; | |||
| 18036 | int Idx = (SubLane * NumSubLaneElts) + Elt; | |||
| 18037 | RepeatedMask[Idx] = M + (Lane * NumLaneElts); | |||
| 18038 | } | |||
| 18039 | } | |||
| 18040 | ||||
| 18041 | // Shuffle each source sub-lane to its destination. | |||
| 18042 | SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1); | |||
| 18043 | for (int i = 0; i != NumElts; i += NumSubLaneElts) { | |||
| 18044 | int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts]; | |||
| 18045 | if (SrcSubLane < 0) | |||
| 18046 | continue; | |||
| 18047 | for (int j = 0; j != NumSubLaneElts; ++j) | |||
| 18048 | SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts); | |||
| 18049 | } | |||
| 18050 | ||||
| 18051 | // Avoid returning the same shuffle operation. | |||
| 18052 | // e.g. v8i32 = vector_shuffle<0,1,4,5,2,3,6,7> t5, undef:v8i32 | |||
| 18053 | if (RepeatedMask == Mask || SubLaneMask == Mask) | |||
| 18054 | return SDValue(); | |||
| 18055 | ||||
| 18056 | SDValue RepeatedShuffle = | |||
| 18057 | DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask); | |||
| 18058 | ||||
| 18059 | return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT), | |||
| 18060 | SubLaneMask); | |||
| 18061 | }; | |||
| 18062 | ||||
| 18063 | // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes | |||
| 18064 | // (with PERMQ/PERMPD). On AVX2/AVX512BW targets, permuting 32-bit sub-lanes, | |||
| 18065 | // even with a variable shuffle, can be worth it for v32i8/v64i8 vectors. | |||
| 18066 | // Otherwise we can only permute whole 128-bit lanes. | |||
| 18067 | int MinSubLaneScale = 1, MaxSubLaneScale = 1; | |||
| 18068 | if (Subtarget.hasAVX2() && VT.is256BitVector()) { | |||
| 18069 | bool OnlyLowestElts = isUndefOrInRange(Mask, 0, NumLaneElts); | |||
| 18070 | MinSubLaneScale = 2; | |||
| 18071 | MaxSubLaneScale = | |||
| 18072 | (!OnlyLowestElts && V2.isUndef() && VT == MVT::v32i8) ? 4 : 2; | |||
| 18073 | } | |||
| 18074 | if (Subtarget.hasBWI() && VT == MVT::v64i8) | |||
| 18075 | MinSubLaneScale = MaxSubLaneScale = 4; | |||
| 18076 | ||||
| 18077 | for (int Scale = MinSubLaneScale; Scale <= MaxSubLaneScale; Scale *= 2) | |||
| 18078 | if (SDValue Shuffle = ShuffleSubLanes(Scale)) | |||
| 18079 | return Shuffle; | |||
| 18080 | ||||
| 18081 | return SDValue(); | |||
| 18082 | } | |||
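// Illustrative aside: a minimal standalone C++ sketch (plain C++17, no LLVM
// headers, independent of this file) of the decomposition this routine
// produces - an in-lane repeating shuffle followed by a whole sub-lane
// permute. Composing the two masks must reproduce the original mask.
#include <cassert>
#include <vector>

int main() {
  // v8f32 mask <5,4,7,6,1,0,3,2>: every destination lane swaps adjacent pairs
  // but reads from the other 128-bit lane, so the mask is lane-crossing yet
  // uses the same local pattern in each lane.
  std::vector<int> Mask = {5, 4, 7, 6, 1, 0, 3, 2};
  // Step 1: repeat the local pattern <1,0,3,2> within every lane.
  std::vector<int> RepeatedMask = {1, 0, 3, 2, 5, 4, 7, 6};
  // Step 2: swap the two 128-bit lanes of the intermediate result.
  std::vector<int> SubLaneMask = {4, 5, 6, 7, 0, 1, 2, 3};
  // Shuffle composition: Out[i] = In[RepeatedMask[SubLaneMask[i]]].
  for (int i = 0; i != 8; ++i)
    assert(RepeatedMask[SubLaneMask[i]] == Mask[i] && "decomposition is wrong");
  return 0;
}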
| 18083 | ||||
| 18084 | static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, | |||
| 18085 | bool &ForceV1Zero, bool &ForceV2Zero, | |||
| 18086 | unsigned &ShuffleImm, ArrayRef<int> Mask, | |||
| 18087 | const APInt &Zeroable) { | |||
| 18088 | int NumElts = VT.getVectorNumElements(); | |||
| 18089 | assert(VT.getScalarSizeInBits() == 64 && | |||
| 18090 | (NumElts == 2 || NumElts == 4 || NumElts == 8) && | |||
| 18091 | "Unexpected data type for VSHUFPD"); | |||
| 18092 | assert(isUndefOrZeroOrInRange(Mask, 0, 2 * NumElts) && | |||
| 18093 | "Illegal shuffle mask"); | |||
| 18094 | ||||
| 18095 | bool ZeroLane[2] = { true, true }; | |||
| 18096 | for (int i = 0; i < NumElts; ++i) | |||
| 18097 | ZeroLane[i & 1] &= Zeroable[i]; | |||
| 18098 | ||||
| 18099 | // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ... | |||
| 18100 | // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ... | |||
| 18101 | ShuffleImm = 0; | |||
| 18102 | bool ShufpdMask = true; | |||
| 18103 | bool CommutableMask = true; | |||
| 18104 | for (int i = 0; i < NumElts; ++i) { | |||
| 18105 | if (Mask[i] == SM_SentinelUndef || ZeroLane[i & 1]) | |||
| 18106 | continue; | |||
| 18107 | if (Mask[i] < 0) | |||
| 18108 | return false; | |||
| 18109 | int Val = (i & 6) + NumElts * (i & 1); | |||
| 18110 | int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1); | |||
| 18111 | if (Mask[i] < Val || Mask[i] > Val + 1) | |||
| 18112 | ShufpdMask = false; | |||
| 18113 | if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1) | |||
| 18114 | CommutableMask = false; | |||
| 18115 | ShuffleImm |= (Mask[i] % 2) << i; | |||
| 18116 | } | |||
| 18117 | ||||
| 18118 | if (!ShufpdMask && !CommutableMask) | |||
| 18119 | return false; | |||
| 18120 | ||||
| 18121 | if (!ShufpdMask && CommutableMask) | |||
| 18122 | std::swap(V1, V2); | |||
| 18123 | ||||
| 18124 | ForceV1Zero = ZeroLane[0]; | |||
| 18125 | ForceV2Zero = ZeroLane[1]; | |||
| 18126 | return true; | |||
| 18127 | } | |||
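// Illustrative aside: a minimal standalone sketch (plain C++17, hypothetical
// helper name, v4f64 case only) of the immediate computation above. Element i
// may only read from the pair starting at (i & 6) - from V1 when i is even,
// from V2 when i is odd - and bit i of the immediate picks the low or high
// element of that pair.
#include <array>
#include <cassert>

static unsigned computeShufpdImm(const std::array<int, 4> &Mask) {
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i) {
    int Val = (i & 6) + 4 * (i & 1); // Base index of the only legal source pair.
    assert(Val <= Mask[i] && Mask[i] <= Val + 1 && "not a SHUFPD mask");
    Imm |= (Mask[i] % 2) << i;       // Low/high selector for element i.
  }
  return Imm;
}

int main() {
  // <0,5,2,7> blends V1's even and V2's odd doubles: immediate 0b1010.
  assert(computeShufpdImm({0, 5, 2, 7}) == 0xA);
  return 0;
}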
| 18128 | ||||
| 18129 | static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, | |||
| 18130 | SDValue V2, ArrayRef<int> Mask, | |||
| 18131 | const APInt &Zeroable, | |||
| 18132 | const X86Subtarget &Subtarget, | |||
| 18133 | SelectionDAG &DAG) { | |||
| 18134 | assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) && | |||
| 18135 | "Unexpected data type for VSHUFPD"); | |||
| 18136 | ||||
| 18137 | unsigned Immediate = 0; | |||
| 18138 | bool ForceV1Zero = false, ForceV2Zero = false; | |||
| 18139 | if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate, | |||
| 18140 | Mask, Zeroable)) | |||
| 18141 | return SDValue(); | |||
| 18142 | ||||
| 18143 | // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. | |||
| 18144 | if (ForceV1Zero) | |||
| 18145 | V1 = getZeroVector(VT, Subtarget, DAG, DL); | |||
| 18146 | if (ForceV2Zero) | |||
| 18147 | V2 = getZeroVector(VT, Subtarget, DAG, DL); | |||
| 18148 | ||||
| 18149 | return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2, | |||
| 18150 | DAG.getTargetConstant(Immediate, DL, MVT::i8)); | |||
| 18151 | } | |||
| 18152 | ||||
| 18153 | // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed | |||
| 18154 | // by zeroable elements in the remaining 24 elements. Turn this into two | |||
| 18155 | // vmovqb instructions shuffled together. | |||
| 18156 | static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, | |||
| 18157 | SDValue V1, SDValue V2, | |||
| 18158 | ArrayRef<int> Mask, | |||
| 18159 | const APInt &Zeroable, | |||
| 18160 | SelectionDAG &DAG) { | |||
| 18161 | assert(VT == MVT::v32i8 && "Unexpected type!"); | |||
| 18162 | ||||
| 18163 | // The first 8 indices should be every 8th element. | |||
| 18164 | if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8)) | |||
| 18165 | return SDValue(); | |||
| 18166 | ||||
| 18167 | // Remaining elements need to be zeroable. | |||
| 18168 | if (Zeroable.countl_one() < (Mask.size() - 8)) | |||
| 18169 | return SDValue(); | |||
| 18170 | ||||
| 18171 | V1 = DAG.getBitcast(MVT::v4i64, V1); | |||
| 18172 | V2 = DAG.getBitcast(MVT::v4i64, V2); | |||
| 18173 | ||||
| 18174 | V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1); | |||
| 18175 | V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2); | |||
| 18176 | ||||
| 18177 | // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in | |||
| 18178 | // the upper bits of the result using an unpckldq. | |||
| 18179 | SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, | |||
| 18180 | { 0, 1, 2, 3, 16, 17, 18, 19, | |||
| 18181 | 4, 5, 6, 7, 20, 21, 22, 23 }); | |||
| 18182 | // Insert the unpckldq into a zero vector to widen to v32i8. | |||
| 18183 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8, | |||
| 18184 | DAG.getConstant(0, DL, MVT::v32i8), Unpack, | |||
| 18185 | DAG.getIntPtrConstant(0, DL)); | |||
| 18186 | } | |||
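// Illustrative aside: a minimal standalone sketch (plain C++17, hypothetical
// helper name) of the two checks above: the first 8 mask entries must be
// 0,8,16,... (the low byte of each of the eight i64 source elements), and the
// remaining 24 destination bytes must all be zeroable.
#include <cassert>

static bool matchesVTRUNCAndUnpack(const int (&Mask)[32],
                                   unsigned NumZeroableTail) {
  for (int i = 0; i != 8; ++i)
    if (Mask[i] >= 0 && Mask[i] != i * 8) // Negative entries are undef.
      return false;
  return NumZeroableTail >= 24;
}

int main() {
  int Mask[32];
  for (int i = 0; i != 32; ++i)
    Mask[i] = (i < 8) ? i * 8 : -1; // Tail is undef/zeroable.
  assert(matchesVTRUNCAndUnpack(Mask, 24));
  return 0;
}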
| 18187 | ||||
| 18188 | // a = shuffle v1, v2, mask1 ; interleaving lower lanes of v1 and v2 | |||
| 18189 | // b = shuffle v1, v2, mask2 ; interleaving higher lanes of v1 and v2 | |||
| 18190 | // => | |||
| 18191 | // ul = unpckl v1, v2 | |||
| 18192 | // uh = unpckh v1, v2 | |||
| 18193 | // a = vperm ul, uh | |||
| 18194 | // b = vperm ul, uh | |||
| 18195 | // | |||
| 18196 | // Pattern-match interleave(256b v1, 256b v2) -> 512b v3 and lower it into unpck | |||
| 18197 | // and permute. We cannot directly match v3 because it is split into two | |||
| 18198 | // 256-bit vectors in earlier isel stages. Therefore, this function matches a | |||
| 18199 | // pair of 256-bit shuffles and makes sure the masks are consecutive. | |||
| 18200 | // | |||
| 18201 | // Once unpck and permute nodes are created, the permute corresponding to this | |||
| 18202 | // shuffle is returned, while the other permute replaces the other half of the | |||
| 18203 | // shuffle in the selection dag. | |||
| 18204 | static SDValue lowerShufflePairAsUNPCKAndPermute(const SDLoc &DL, MVT VT, | |||
| 18205 | SDValue V1, SDValue V2, | |||
| 18206 | ArrayRef<int> Mask, | |||
| 18207 | SelectionDAG &DAG) { | |||
| 18208 | if (VT != MVT::v8f32 && VT != MVT::v8i32 && VT != MVT::v16i16 && | |||
| 18209 | VT != MVT::v32i8) | |||
| 18210 | return SDValue(); | |||
| 18211 | // <B0, B1, B0+1, B1+1, ...> | |||
| 18212 | auto IsInterleavingPattern = [&](ArrayRef<int> Mask, unsigned Begin0, | |||
| 18213 | unsigned Begin1) { | |||
| 18214 | size_t Size = Mask.size(); | |||
| 18215 | assert(Size % 2 == 0 && "Expected even mask size"); | |||
| 18216 | for (unsigned I = 0; I < Size; I += 2) { | |||
| 18217 | if (Mask[I] != (int)(Begin0 + I / 2) || | |||
| 18218 | Mask[I + 1] != (int)(Begin1 + I / 2)) | |||
| 18219 | return false; | |||
| 18220 | } | |||
| 18221 | return true; | |||
| 18222 | }; | |||
| 18223 | // Check which half of the interleave this shuffle node is. | |||
| 18224 | int NumElts = VT.getVectorNumElements(); | |||
| 18225 | size_t FirstQtr = NumElts / 2; | |||
| 18226 | size_t ThirdQtr = NumElts + NumElts / 2; | |||
| 18227 | bool IsFirstHalf = IsInterleavingPattern(Mask, 0, NumElts); | |||
| 18228 | bool IsSecondHalf = IsInterleavingPattern(Mask, FirstQtr, ThirdQtr); | |||
| 18229 | if (!IsFirstHalf && !IsSecondHalf) | |||
| 18230 | return SDValue(); | |||
| 18231 | ||||
| 18232 | // Find the intersection between shuffle users of V1 and V2. | |||
| 18233 | SmallVector<SDNode *, 2> Shuffles; | |||
| 18234 | for (SDNode *User : V1->uses()) | |||
| 18235 | if (User->getOpcode() == ISD::VECTOR_SHUFFLE && User->getOperand(0) == V1 && | |||
| 18236 | User->getOperand(1) == V2) | |||
| 18237 | Shuffles.push_back(User); | |||
| 18238 | // Limit the number of shuffle users to two for now. | |||
| 18239 | if (Shuffles.size() != 2) | |||
| 18240 | return SDValue(); | |||
| 18241 | // Find out which half of the 512-bit shuffle each smaller shuffle represents. | |||
| 18242 | auto *SVN1 = cast<ShuffleVectorSDNode>(Shuffles[0]); | |||
| 18243 | auto *SVN2 = cast<ShuffleVectorSDNode>(Shuffles[1]); | |||
| 18244 | SDNode *FirstHalf; | |||
| 18245 | SDNode *SecondHalf; | |||
| 18246 | if (IsInterleavingPattern(SVN1->getMask(), 0, NumElts) && | |||
| 18247 | IsInterleavingPattern(SVN2->getMask(), FirstQtr, ThirdQtr)) { | |||
| 18248 | FirstHalf = Shuffles[0]; | |||
| 18249 | SecondHalf = Shuffles[1]; | |||
| 18250 | } else if (IsInterleavingPattern(SVN1->getMask(), FirstQtr, ThirdQtr) && | |||
| 18251 | IsInterleavingPattern(SVN2->getMask(), 0, NumElts)) { | |||
| 18252 | FirstHalf = Shuffles[1]; | |||
| 18253 | SecondHalf = Shuffles[0]; | |||
| 18254 | } else { | |||
| 18255 | return SDValue(); | |||
| 18256 | } | |||
| 18257 | // Lower into unpck and perm. Return the perm of this shuffle and replace | |||
| 18258 | // the other. | |||
| 18259 | SDValue Unpckl = DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2); | |||
| 18260 | SDValue Unpckh = DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2); | |||
| 18261 | SDValue Perm1 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh, | |||
| 18262 | DAG.getTargetConstant(0x20, DL, MVT::i8)); | |||
| 18263 | SDValue Perm2 = DAG.getNode(X86ISD::VPERM2X128, DL, VT, Unpckl, Unpckh, | |||
| 18264 | DAG.getTargetConstant(0x31, DL, MVT::i8)); | |||
| 18265 | if (IsFirstHalf) { | |||
| 18266 | DAG.ReplaceAllUsesWith(SecondHalf, &Perm2); | |||
| 18267 | return Perm1; | |||
| 18268 | } | |||
| 18269 | DAG.ReplaceAllUsesWith(FirstHalf, &Perm1); | |||
| 18270 | return Perm2; | |||
| 18271 | } | |||
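// Illustrative aside: a minimal standalone sketch (plain C++17) of the
// IsInterleavingPattern test above, using the two v8i32 halves of a 512-bit
// interleave of V1 (elements 0..7) and V2 (elements 8..15).
#include <cassert>
#include <vector>

static bool isInterleavingPattern(const std::vector<int> &Mask, int Begin0,
                                  int Begin1) {
  for (size_t I = 0; I < Mask.size(); I += 2)
    if (Mask[I] != Begin0 + (int)(I / 2) ||
        Mask[I + 1] != Begin1 + (int)(I / 2))
      return false;
  return true;
}

int main() {
  // The first half interleaves the low quarters of V1 and V2...
  assert(isInterleavingPattern({0, 8, 1, 9, 2, 10, 3, 11}, 0, 8));
  // ...and the second half interleaves the next quarters (FirstQtr/ThirdQtr).
  assert(isInterleavingPattern({4, 12, 5, 13, 6, 14, 7, 15}, 4, 12));
  return 0;
}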
| 18272 | ||||
| 18273 | /// Handle lowering of 4-lane 64-bit floating point shuffles. | |||
| 18274 | /// | |||
| 18275 | /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 | |||
| 18276 | /// isn't available. | |||
| 18277 | static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 18278 | const APInt &Zeroable, SDValue V1, SDValue V2, | |||
| 18279 | const X86Subtarget &Subtarget, | |||
| 18280 | SelectionDAG &DAG) { | |||
| 18281 | assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); | |||
| 18282 | assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!"); | |||
| 18283 | assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); | |||
| 18284 | ||||
| 18285 | if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4f64, V1, V2, Mask, Zeroable, | |||
| 18286 | Subtarget, DAG)) | |||
| 18287 | return V; | |||
| 18288 | ||||
| 18289 | if (V2.isUndef()) { | |||
| 18290 | // Check for being able to broadcast a single element. | |||
| 18291 | if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4f64, V1, V2, | |||
| 18292 | Mask, Subtarget, DAG)) | |||
| 18293 | return Broadcast; | |||
| 18294 | ||||
| 18295 | // Use low duplicate instructions for masks that match their pattern. | |||
| 18296 | if (isShuffleEquivalent(Mask, {0, 0, 2, 2}, V1, V2)) | |||
| 18297 | return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1); | |||
| 18298 | ||||
| 18299 | if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) { | |||
| 18300 | // Non-half-crossing single input shuffles can be lowered with an | |||
| 18301 | // interleaved permutation. | |||
| 18302 | unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | | |||
| 18303 | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3); | |||
| 18304 | return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1, | |||
| 18305 | DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8)); | |||
| 18306 | } | |||
| 18307 | ||||
| 18308 | // With AVX2 we have direct support for this permutation. | |||
| 18309 | if (Subtarget.hasAVX2()) | |||
| 18310 | return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1, | |||
| 18311 | getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); | |||
| 18312 | ||||
| 18313 | // Try to create an in-lane repeating shuffle mask and then shuffle the | |||
| 18314 | // results into the target lanes. | |||
| 18315 | if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( | |||
| 18316 | DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) | |||
| 18317 | return V; | |||
| 18318 | ||||
| 18319 | // Try to permute the lanes and then use a per-lane permute. | |||
| 18320 | if (SDValue V = lowerShuffleAsLanePermuteAndPermute(DL, MVT::v4f64, V1, V2, | |||
| 18321 | Mask, DAG, Subtarget)) | |||
| 18322 | return V; | |||
| 18323 | ||||
| 18324 | // Otherwise, fall back. | |||
| 18325 | return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v4f64, V1, V2, Mask, | |||
| 18326 | DAG, Subtarget); | |||
| 18327 | } | |||
| 18328 | ||||
| 18329 | // Use dedicated unpack instructions for masks that match their pattern. | |||
| 18330 | if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG)) | |||
| 18331 | return V; | |||
| 18332 | ||||
| 18333 | if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask, | |||
| 18334 | Zeroable, Subtarget, DAG)) | |||
| 18335 | return Blend; | |||
| 18336 | ||||
| 18337 | // Check whether the blend happens to exactly fit the SHUFPD pattern. | |||
| 18338 | if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v4f64, V1, V2, Mask, | |||
| 18339 | Zeroable, Subtarget, DAG)) | |||
| 18340 | return Op; | |||
| 18341 | ||||
| 18342 | bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask); | |||
| 18343 | bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask); | |||
| 18344 | ||||
| 18345 | // If we have lane crossing shuffles AND they don't all come from the lower | |||
| 18346 | // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)). | |||
| 18347 | // TODO: Handle BUILD_VECTOR sources which getVectorShuffle currently | |||
| 18348 | // canonicalizes to a blend of splats, which isn't necessary for this combine. | |||
| 18349 | if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) && | |||
| 18350 | !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) && | |||
| 18351 | (V1.getOpcode() != ISD::BUILD_VECTOR) && | |||
| 18352 | (V2.getOpcode() != ISD::BUILD_VECTOR)) | |||
| 18353 | return lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2, Mask, DAG); | |||
| 18354 | ||||
| 18355 | // If we have one input in place, then we can permute the other input and | |||
| 18356 | // blend the result. | |||
| 18357 | if (V1IsInPlace || V2IsInPlace) | |||
| 18358 | return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask, | |||
| 18359 | Subtarget, DAG); | |||
| 18360 | ||||
| 18361 | // Try to create an in-lane repeating shuffle mask and then shuffle the | |||
| 18362 | // results into the target lanes. | |||
| 18363 | if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( | |||
| 18364 | DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) | |||
| 18365 | return V; | |||
| 18366 | ||||
| 18367 | // Try to simplify this by merging 128-bit lanes to enable a lane-based | |||
| 18368 | // shuffle. However, if we have AVX2 and either input is already in place, | |||
| 18369 | // we will be able to shuffle the other input even across lanes in a single | |||
| 18370 | // instruction, so skip this pattern. | |||
| 18371 | if (!(Subtarget.hasAVX2() && (V1IsInPlace || V2IsInPlace))) | |||
| 18372 | if (SDValue V = lowerShuffleAsLanePermuteAndRepeatedMask( | |||
| 18373 | DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG)) | |||
| 18374 | return V; | |||
| 18375 | ||||
| 18376 | // If we have VLX support, we can use VEXPAND. | |||
| 18377 | if (Subtarget.hasVLX()) | |||
| 18378 | if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2, | |||
| 18379 | DAG, Subtarget)) | |||
| 18380 | return V; | |||
| 18381 | ||||
| 18382 | // If we have AVX2 then we always want to lower with a blend because at v4 | |||
| 18383 | // we can fully permute the elements. | |||
| 18384 | if (Subtarget.hasAVX2()) | |||
| 18385 | return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4f64, V1, V2, Mask, | |||
| 18386 | Subtarget, DAG); | |||
| 18387 | ||||
| 18388 | // Otherwise fall back on generic lowering. | |||
| 18389 | return lowerShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, | |||
| 18390 | Subtarget, DAG); | |||
| 18391 | } | |||
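// Illustrative aside: a minimal standalone sketch (plain C++17, hypothetical
// helper name) of the VPERMILPI immediate built in the V2.isUndef() path
// above: for a non-lane-crossing v4f64 mask, bit i is set exactly when
// element i selects the high double of its 128-bit lane.
#include <cassert>

static unsigned vpermilpdImm(int M0, int M1, int M2, int M3) {
  return (unsigned)((M0 == 1) | ((M1 == 1) << 1) | ((M2 == 3) << 2) |
                    ((M3 == 3) << 3));
}

int main() {
  // <1,0,3,2> swaps the doubles inside each 128-bit lane: immediate 0b0101.
  assert(vpermilpdImm(1, 0, 3, 2) == 0x5);
  return 0;
}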
| 18392 | ||||
| 18393 | /// Handle lowering of 4-lane 64-bit integer shuffles. | |||
| 18394 | /// | |||
| 18395 | /// This routine is only called when we have AVX2 and thus a reasonable | |||
| 18396 | /// instruction set for v4i64 shuffling. | |||
| 18397 | static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 18398 | const APInt &Zeroable, SDValue V1, SDValue V2, | |||
| 18399 | const X86Subtarget &Subtarget, | |||
| 18400 | SelectionDAG &DAG) { | |||
| 18401 | assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); | |||
| 18402 | assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!"); | |||
| 18403 | assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); | |||
| 18404 | assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!"); | |||
| 18405 | ||||
| 18406 | if (SDValue V = lowerV2X128Shuffle(DL, MVT::v4i64, V1, V2, Mask, Zeroable, | |||
| 18407 | Subtarget, DAG)) | |||
| 18408 | return V; | |||
| 18409 | ||||
| 18410 | if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask, | |||
| 18411 | Zeroable, Subtarget, DAG)) | |||
| 18412 | return Blend; | |||
| 18413 | ||||
| 18414 | // Check for being able to broadcast a single element. | |||
| 18415 | if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v4i64, V1, V2, Mask, | |||
| 18416 | Subtarget, DAG)) | |||
| 18417 | return Broadcast; | |||
| 18418 | ||||
| 18419 | // Try to use shift instructions if fast. | |||
| 18420 | if (Subtarget.preferLowerShuffleAsShift()) | |||
| 18421 | if (SDValue Shift = | |||
| 18422 | lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, | |||
| 18423 | Subtarget, DAG, /*BitwiseOnly*/ true)) | |||
| 18424 | return Shift; | |||
| 18425 | ||||
| 18426 | if (V2.isUndef()) { | |||
| 18427 | // When the shuffle is mirrored between the 128-bit lanes of the input, we | |||
| 18428 | // can use lower-latency instructions that will operate on both lanes. | |||
| 18429 | SmallVector<int, 2> RepeatedMask; | |||
| 18430 | if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { | |||
| 18431 | SmallVector<int, 4> PSHUFDMask; | |||
| 18432 | narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask); | |||
| 18433 | return DAG.getBitcast( | |||
| 18434 | MVT::v4i64, | |||
| 18435 | DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, | |||
| 18436 | DAG.getBitcast(MVT::v8i32, V1), | |||
| 18437 | getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); | |||
| 18438 | } | |||
| 18439 | ||||
| 18440 | // AVX2 provides a direct instruction for permuting a single input across | |||
| 18441 | // lanes. | |||
| 18442 | return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1, | |||
| 18443 | getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); | |||
| 18444 | } | |||
| 18445 | ||||
| 18446 | // Try to use shift instructions. | |||
| 18447 | if (SDValue Shift = | |||
| 18448 | lowerShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask, Zeroable, Subtarget, | |||
| 18449 | DAG, /*BitwiseOnly*/ false)) | |||
| 18450 | return Shift; | |||
| 18451 | ||||
| 18452 | // If we have VLX support, we can use VALIGN or VEXPAND. | |||
| 18453 | if (Subtarget.hasVLX()) { | |||
| 18454 | if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask, | |||
| 18455 | Subtarget, DAG)) | |||
| 18456 | return Rotate; | |||
| 18457 | ||||
| 18458 | if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2, | |||
| 18459 | DAG, Subtarget)) | |||
| 18460 | return V; | |||
| 18461 | } | |||
| 18462 | ||||
| 18463 | // Try to use PALIGNR. | |||
| 18464 | if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v4i64, V1, V2, Mask, | |||
| 18465 | Subtarget, DAG)) | |||
| 18466 | return Rotate; | |||
| 18467 | ||||
| 18468 | // Use dedicated unpack instructions for masks that match their pattern. | |||
| 18469 | if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG)) | |||
| 18470 | return V; | |||
| 18471 | ||||
| 18472 | bool V1IsInPlace = isShuffleMaskInputInPlace(0, Mask); | |||
| 18473 | bool V2IsInPlace = isShuffleMaskInputInPlace(1, Mask); | |||
| 18474 | ||||
| 18475 | // If we have one input in place, then we can permute the other input and | |||
| 18476 | // blend the result. | |||
| 18477 | if (V1IsInPlace || V2IsInPlace) | |||
| 18478 | return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask, | |||
| 18479 | Subtarget, DAG); | |||
| 18480 | ||||
| 18481 | // Try to create an in-lane repeating shuffle mask and then shuffle the | |||
| 18482 | // results into the target lanes. | |||
| 18483 | if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( | |||
| 18484 | DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) | |||
| 18485 | return V; | |||
| 18486 | ||||
| 18487 | // Try to lower to PERMQ(BLENDD(V1,V2)). | |||
| 18488 | if (SDValue V = | |||
| 18489 | lowerShuffleAsBlendAndPermute(DL, MVT::v4i64, V1, V2, Mask, DAG)) | |||
| 18490 | return V; | |||
| 18491 | ||||
| 18492 | // Try to simplify this by merging 128-bit lanes to enable a lane-based | |||
| 18493 | // shuffle. However, if we have AVX2 and either input is already in place, | |||
| 18494 | // we will be able to shuffle the other input even across lanes in a single | |||
| 18495 | // instruction, so skip this pattern. | |||
| 18496 | if (!V1IsInPlace && !V2IsInPlace) | |||
| 18497 | if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( | |||
| 18498 | DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG)) | |||
| 18499 | return Result; | |||
| 18500 | ||||
| 18501 | // Otherwise fall back on generic blend lowering. | |||
| 18502 | return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v4i64, V1, V2, Mask, | |||
| 18503 | Subtarget, DAG); | |||
| 18504 | } | |||
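// Illustrative aside: a minimal standalone sketch (plain C++17, hypothetical
// helper name) of the narrowShuffleMaskElts() step used above: a repeated
// two-element v4i64 mask is rewritten in v8i32 terms, each 64-bit index m
// expanding to the 32-bit index pair {2m, 2m+1} so PSHUFD can implement it.
#include <cassert>
#include <vector>

static std::vector<int> narrowMaskElts(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Out;
  for (int M : Mask)
    for (int i = 0; i != Scale; ++i)
      Out.push_back(M < 0 ? -1 : M * Scale + i); // Keep undef (-1) as undef.
  return Out;
}

int main() {
  // The repeated v4i64 mask <1,0> (swap qwords in each lane) becomes the
  // v8i32 PSHUFD mask <2,3,0,1>.
  assert(narrowMaskElts(2, {1, 0}) == std::vector<int>({2, 3, 0, 1}));
  return 0;
}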
| 18505 | ||||
| 18506 | /// Handle lowering of 8-lane 32-bit floating point shuffles. | |||
| 18507 | /// | |||
| 18508 | /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2 | |||
| 18509 | /// isn't available. | |||
| 18510 | static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 18511 | const APInt &Zeroable, SDValue V1, SDValue V2, | |||
| 18512 | const X86Subtarget &Subtarget, | |||
| 18513 | SelectionDAG &DAG) { | |||
| 18514 | assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); | |||
| 18515 | assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!"); | |||
| 18516 | assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); | |||
| 18517 | ||||
| 18518 | if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask, | |||
| 18519 | Zeroable, Subtarget, DAG)) | |||
| 18520 | return Blend; | |||
| 18521 | ||||
| 18522 | // Check for being able to broadcast a single element. | |||
| 18523 | if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8f32, V1, V2, Mask, | |||
| 18524 | Subtarget, DAG)) | |||
| 18525 | return Broadcast; | |||
| 18526 | ||||
| 18527 | if (!Subtarget.hasAVX2()) { | |||
| 18528 | SmallVector<int> InLaneMask; | |||
| 18529 | computeInLaneShuffleMask(Mask, Mask.size() / 2, InLaneMask); | |||
| 18530 | ||||
| 18531 | if (!is128BitLaneRepeatedShuffleMask(MVT::v8f32, InLaneMask)) | |||
| 18532 | if (SDValue R = splitAndLowerShuffle(DL, MVT::v8f32, V1, V2, Mask, DAG, | |||
| 18533 | /*SimpleOnly*/ true)) | |||
| 18534 | return R; | |||
| 18535 | } | |||
| 18536 | if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask, | |||
| 18537 | Zeroable, Subtarget, DAG)) | |||
| 18538 | return DAG.getBitcast(MVT::v8f32, ZExt); | |||
| 18539 | ||||
| 18540 | // If the shuffle mask is repeated in each 128-bit lane, we have many more | |||
| 18541 | // options to efficiently lower the shuffle. | |||
| 18542 | SmallVector<int, 4> RepeatedMask; | |||
| 18543 | if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) { | |||
| 18544 | assert(RepeatedMask.size() == 4 && | |||
| 18545 | "Repeated masks must be half the mask width!"); | |||
| 18546 | ||||
| 18547 | // Use even/odd duplicate instructions for masks that match their pattern. | |||
| 18548 | if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2)) | |||
| 18549 | return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1); | |||
| 18550 | if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2)) | |||
| 18551 | return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1); | |||
| 18552 | ||||
| 18553 | if (V2.isUndef()) | |||
| 18554 | return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1, | |||
| 18555 | getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); | |||
| 18556 | ||||
| 18557 | // Use dedicated unpack instructions for masks that match their pattern. | |||
| 18558 | if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG)) | |||
| 18559 | return V; | |||
| 18560 | ||||
| 18561 | // Otherwise, fall back to a SHUFPS sequence. Here it is important that we | |||
| 18562 | // have already handled any direct blends. | |||
| 18563 | return lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG); | |||
| 18564 | } | |||
| 18565 | ||||
| 18566 | // Try to create an in-lane repeating shuffle mask and then shuffle the | |||
| 18567 | // results into the target lanes. | |||
| 18568 | if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( | |||
| 18569 | DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) | |||
| 18570 | return V; | |||
| 18571 | ||||
| 18572 | // If we have a single input shuffle with different shuffle patterns in the | |||
| 18573 | // two 128-bit lanes, use a variable mask with VPERMILPS. | |||
| 18574 | if (V2.isUndef()) { | |||
| 18575 | if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) { | |||
| 18576 | SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); | |||
| 18577 | return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask); | |||
| 18578 | } | |||
| 18579 | if (Subtarget.hasAVX2()) { | |||
| 18580 | SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); | |||
| 18581 | return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1); | |||
| 18582 | } | |||
| 18583 | // Otherwise, fall back. | |||
| 18584 | return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask, | |||
| 18585 | DAG, Subtarget); | |||
| 18586 | } | |||
| 18587 | ||||
| 18588 | // Try to simplify this by merging 128-bit lanes to enable a lane-based | |||
| 18589 | // shuffle. | |||
| 18590 | if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( | |||
| 18591 | DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG)) | |||
| 18592 | return Result; | |||
| 18593 | ||||
| 18594 | // If we have VLX support, we can use VEXPAND. | |||
| 18595 | if (Subtarget.hasVLX()) | |||
| 18596 | if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2, | |||
| 18597 | DAG, Subtarget)) | |||
| 18598 | return V; | |||
| 18599 | ||||
| 18600 | // Try to match an interleave of two v8f32s and lower them as unpck and | |||
| 18601 | // permutes using ymms. This needs to go before we try to split the vectors. | |||
| 18602 | // | |||
| 18603 | // TODO: Expand this to AVX1. Currently v8i32 is cast to v8f32 and hits | |||
| 18604 | // this path inadvertently. | |||
| 18605 | if (Subtarget.hasAVX2() && !Subtarget.hasAVX512()) | |||
| 18606 | if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8f32, V1, V2, | |||
| 18607 | Mask, DAG)) | |||
| 18608 | return V; | |||
| 18609 | ||||
| 18610 | // For non-AVX512 targets, if the mask matches an in-lane unpack of 16-bit | |||
| 18611 | // elements, try to split, since after the split we get more efficient code | |||
| 18612 | // using vpunpcklwd and vpunpckhwd than with vblend. | |||
| 18613 | if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32, DAG)) | |||
| 18614 | return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, Subtarget, | |||
| 18615 | DAG); | |||
| 18616 | ||||
| 18617 | // If we have AVX2 then we always want to lower with a blend because at v8 we | |||
| 18618 | // can fully permute the elements. | |||
| 18619 | if (Subtarget.hasAVX2()) | |||
| 18620 | return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8f32, V1, V2, Mask, | |||
| 18621 | Subtarget, DAG); | |||
| 18622 | ||||
| 18623 | // Otherwise fall back on generic lowering. | |||
| 18624 | return lowerShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, | |||
| 18625 | Subtarget, DAG); | |||
| 18626 | } | |||
| 18627 | ||||
| 18628 | /// Handle lowering of 8-lane 32-bit integer shuffles. | |||
| 18629 | /// | |||
| 18630 | /// This routine is only called when we have AVX2 and thus a reasonable | |||
| 18631 | /// instruction set for v8i32 shuffling. | |||
| 18632 | static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 18633 | const APInt &Zeroable, SDValue V1, SDValue V2, | |||
| 18634 | const X86Subtarget &Subtarget, | |||
| 18635 | SelectionDAG &DAG) { | |||
| 18636 | assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); | |||
| 18637 | assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!"); | |||
| 18638 | assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); | |||
| 18639 | assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!"); | |||
| 18640 | ||||
| 18641 | int NumV2Elements = count_if(Mask, [](int M) { return M >= 8; }); | |||
| 18642 | ||||
| 18643 | // Whenever we can lower this as a zext, that instruction is strictly faster | |||
| 18644 | // than any alternative. It also allows us to fold memory operands into the | |||
| 18645 | // shuffle in many cases. | |||
| 18646 | if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v8i32, V1, V2, Mask, | |||
| 18647 | Zeroable, Subtarget, DAG)) | |||
| 18648 | return ZExt; | |||
| 18649 | ||||
| 18650 | // Try to match an interleave of two v8i32s and lower them as unpck and | |||
| 18651 | // permutes using ymms. This needs to go before we try to split the vectors. | |||
| 18652 | if (!Subtarget.hasAVX512()) | |||
| 18653 | if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v8i32, V1, V2, | |||
| 18654 | Mask, DAG)) | |||
| 18655 | return V; | |||
| 18656 | ||||
| 18657 | // For non-AVX512 targets, if the mask matches an in-lane unpack of 16-bit | |||
| 18658 | // elements, try to split, since after the split we get more efficient code | |||
| 18659 | // than vblend by using vpunpcklwd and vpunpckhwd. | |||
| 18660 | if (isUnpackWdShuffleMask(Mask, MVT::v8i32, DAG) && !V2.isUndef() && | |||
| 18661 | !Subtarget.hasAVX512()) | |||
| 18662 | return lowerShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, Subtarget, | |||
| 18663 | DAG); | |||
| 18664 | ||||
| 18665 | if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask, | |||
| 18666 | Zeroable, Subtarget, DAG)) | |||
| 18667 | return Blend; | |||
| 18668 | ||||
| 18669 | // Check for being able to broadcast a single element. | |||
| 18670 | if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v8i32, V1, V2, Mask, | |||
| 18671 | Subtarget, DAG)) | |||
| 18672 | return Broadcast; | |||
| 18673 | ||||
| 18674 | // Try to use shift instructions if fast. | |||
| 18675 | if (Subtarget.preferLowerShuffleAsShift()) { | |||
| 18676 | if (SDValue Shift = | |||
| 18677 | lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, | |||
| 18678 | Subtarget, DAG, /*BitwiseOnly*/ true)) | |||
| 18679 | return Shift; | |||
| 18680 | if (NumV2Elements == 0) | |||
| 18681 | if (SDValue Rotate = | |||
| 18682 | lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG)) | |||
| 18683 | return Rotate; | |||
| 18684 | } | |||
| 18685 | ||||
| 18686 | // If the shuffle mask is repeated in each 128-bit lane, we can use more | |||
| 18687 | // efficient instructions that mirror the shuffles across the two 128-bit | |||
| 18688 | // lanes. | |||
| 18689 | SmallVector<int, 4> RepeatedMask; | |||
| 18690 | bool Is128BitLaneRepeatedShuffle = | |||
| 18691 | is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask); | |||
| 18692 | if (Is128BitLaneRepeatedShuffle) { | |||
| 18693 | assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); | |||
| 18694 | if (V2.isUndef()) | |||
| 18695 | return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1, | |||
| 18696 | getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); | |||
| 18697 | ||||
| 18698 | // Use dedicated unpack instructions for masks that match their pattern. | |||
| 18699 | if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG)) | |||
| 18700 | return V; | |||
| 18701 | } | |||
| 18702 | ||||
| 18703 | // Try to use shift instructions. | |||
| 18704 | if (SDValue Shift = | |||
| 18705 | lowerShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, | |||
| 18706 | DAG, /*BitwiseOnly*/ false)) | |||
| 18707 | return Shift; | |||
| 18708 | ||||
| 18709 | if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements == 0) | |||
| 18710 | if (SDValue Rotate = | |||
| 18711 | lowerShuffleAsBitRotate(DL, MVT::v8i32, V1, Mask, Subtarget, DAG)) | |||
| 18712 | return Rotate; | |||
| 18713 | ||||
| 18714 | // If we have VLX support, we can use VALIGN or EXPAND. | |||
| 18715 | if (Subtarget.hasVLX()) { | |||
| 18716 | if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask, | |||
| 18717 | Subtarget, DAG)) | |||
| 18718 | return Rotate; | |||
| 18719 | ||||
| 18720 | if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2, | |||
| 18721 | DAG, Subtarget)) | |||
| 18722 | return V; | |||
| 18723 | } | |||
| 18724 | ||||
| 18725 | // Try to use byte rotation instructions. | |||
| 18726 | if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i32, V1, V2, Mask, | |||
| 18727 | Subtarget, DAG)) | |||
| 18728 | return Rotate; | |||
| 18729 | ||||
| 18730 | // Try to create an in-lane repeating shuffle mask and then shuffle the | |||
| 18731 | // results into the target lanes. | |||
| 18732 | if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( | |||
| 18733 | DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) | |||
| 18734 | return V; | |||
| 18735 | ||||
| 18736 | if (V2.isUndef()) { | |||
| 18737 | // Try to produce a fixed cross-128-bit lane permute followed by unpack | |||
| 18738 | // because that should be faster than the variable permute alternatives. | |||
| 18739 | if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG)) | |||
| 18740 | return V; | |||
| 18741 | ||||
| 18742 | // If the shuffle patterns aren't repeated but it's a single input, directly | |||
| 18743 | // generate a cross-lane VPERMD instruction. | |||
| 18744 | SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true); | |||
| 18745 | return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1); | |||
| 18746 | } | |||
| 18747 | ||||
| 18748 | // Assume that a single SHUFPS is faster than an alternative sequence of | |||
| 18749 | // multiple instructions (even if the CPU has a domain penalty). | |||
| 18750 | // If some CPU is harmed by the domain switch, we can fix it in a later pass. | |||
| 18751 | if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { | |||
| 18752 | SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1); | |||
| 18753 | SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2); | |||
| 18754 | SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, | |||
| 18755 | CastV1, CastV2, DAG); | |||
| 18756 | return DAG.getBitcast(MVT::v8i32, ShufPS); | |||
| 18757 | } | |||
| 18758 | ||||
| 18759 | // Try to simplify this by merging 128-bit lanes to enable a lane-based | |||
| 18760 | // shuffle. | |||
| 18761 | if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( | |||
| 18762 | DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG)) | |||
| 18763 | return Result; | |||
| 18764 | ||||
| 18765 | // Otherwise fall back on generic blend lowering. | |||
| 18766 | return lowerShuffleAsDecomposedShuffleMerge(DL, MVT::v8i32, V1, V2, Mask, | |||
| 18767 | Subtarget, DAG); | |||
| 18768 | } | |||
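// Illustrative aside: a minimal standalone sketch (plain C++17, single-input
// case only, hypothetical helper name) of the repeated-lane test used above:
// element i may only read from its own lane, and the in-lane position it
// reads must agree across all lanes.
#include <cassert>
#include <vector>

static bool isLaneRepeated(const std::vector<int> &Mask, int NumLaneElts,
                           std::vector<int> &Repeated) {
  Repeated.assign(NumLaneElts, -1);
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue; // Undef matches anything.
    if (M / NumLaneElts != i / NumLaneElts)
      return false; // Crosses a 128-bit lane.
    int &R = Repeated[i % NumLaneElts];
    if (R >= 0 && R != M % NumLaneElts)
      return false; // Lanes disagree on this position.
    R = M % NumLaneElts;
  }
  return true;
}

int main() {
  std::vector<int> Rep;
  // <3,2,1,0> repeated in both lanes of a v8i32: OK.
  assert(isLaneRepeated({3, 2, 1, 0, 7, 6, 5, 4}, 4, Rep));
  // The second lane uses a different pattern: not repeated.
  assert(!isLaneRepeated({3, 2, 1, 0, 4, 5, 6, 7}, 4, Rep));
  return 0;
}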
| 18769 | ||||
| 18770 | /// Handle lowering of 16-lane 16-bit integer shuffles. | |||
| 18771 | /// | |||
| 18772 | /// This routine is only called when we have AVX2 and thus a reasonable | |||
| 18773 | /// instruction set for v16i16 shuffling. | |||
| 18774 | static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 18775 | const APInt &Zeroable, SDValue V1, SDValue V2, | |||
| 18776 | const X86Subtarget &Subtarget, | |||
| 18777 | SelectionDAG &DAG) { | |||
| 18778 | assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); | |||
| 18779 | assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!"); | |||
| 18780 | assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); | |||
| 18781 | assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!"); | |||
| 18782 | ||||
| 18783 | // Whenever we can lower this as a zext, that instruction is strictly faster | |||
| 18784 | // than any alternative. It also allows us to fold memory operands into the | |||
| 18785 | // shuffle in many cases. | |||
| 18786 | if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( | |||
| 18787 | DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) | |||
| 18788 | return ZExt; | |||
| 18789 | ||||
| 18790 | // Check for being able to broadcast a single element. | |||
| 18791 | if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v16i16, V1, V2, Mask, | |||
| 18792 | Subtarget, DAG)) | |||
| 18793 | return Broadcast; | |||
| 18794 | ||||
| 18795 | if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask, | |||
| 18796 | Zeroable, Subtarget, DAG)) | |||
| 18797 | return Blend; | |||
| 18798 | ||||
| 18799 | // Use dedicated unpack instructions for masks that match their pattern. | |||
| 18800 | if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) | |||
| 18801 | return V; | |||
| 18802 | ||||
| 18803 | // Use dedicated pack instructions for masks that match their pattern. | |||
| 18804 | if (SDValue V = lowerShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG, | |||
| 18805 | Subtarget)) | |||
| 18806 | return V; | |||
| 18807 | ||||
| 18808 | // Try to lower using a truncation. | |||
| 18809 | if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v16i16, V1, V2, Mask, Zeroable, | |||
| 18810 | Subtarget, DAG)) | |||
| 18811 | return V; | |||
| 18812 | ||||
| 18813 | // Try to use shift instructions. | |||
| 18814 | if (SDValue Shift = | |||
| 18815 | lowerShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable, | |||
| 18816 | Subtarget, DAG, /*BitwiseOnly*/ false)) | |||
| 18817 | return Shift; | |||
| 18818 | ||||
| 18819 | // Try to use byte rotation instructions. | |||
| 18820 | if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i16, V1, V2, Mask, | |||
| 18821 | Subtarget, DAG)) | |||
| 18822 | return Rotate; | |||
| 18823 | ||||
| 18824 | // Try to create an in-lane repeating shuffle mask and then shuffle the | |||
| 18825 | // results into the target lanes. | |||
| 18826 | if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( | |||
| 18827 | DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) | |||
| 18828 | return V; | |||
| 18829 | ||||
| 18830 | if (V2.isUndef()) { | |||
| 18831 | // Try to use bit rotation instructions. | |||
| 18832 | if (SDValue Rotate = | |||
| 18833 | lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG)) | |||
| 18834 | return Rotate; | |||
| 18835 | ||||
| 18836 | // Try to produce a fixed cross-128-bit lane permute followed by unpack | |||
| 18837 | // because that should be faster than the variable permute alternatives. | |||
| 18838 | if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG)) | |||
| 18839 | return V; | |||
| 18840 | ||||
| 18841 | // There are no generalized cross-lane shuffle operations available on i16 | |||
| 18842 | // element types. | |||
| 18843 | if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) { | |||
| 18844 | if (SDValue V = lowerShuffleAsLanePermuteAndPermute( | |||
| 18845 | DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) | |||
| 18846 | return V; | |||
| 18847 | ||||
| 18848 | return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v16i16, V1, V2, Mask, | |||
| 18849 | DAG, Subtarget); | |||
| 18850 | } | |||
| 18851 | ||||
| 18852 | SmallVector<int, 8> RepeatedMask; | |||
| 18853 | if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { | |||
| 18854 | // As this is a single-input shuffle, the repeated mask should be | |||
| 18855 | // a strictly valid v8i16 mask that we can pass through to the v8i16 | |||
| 18856 | // lowering to handle even the v16 case. | |||
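| | // Illustrative example (not from the original source): a v16i16 mask is | |||
| | // 128-bit-lane repeated when Mask[i + 8] == Mask[i] + 8 for each in-range | |||
| | // entry, e.g. {3,2,1,0,7,6,5,4, 11,10,9,8,15,14,13,12} repeats the v8i16 | |||
| | // mask {3,2,1,0,7,6,5,4} in both lanes. | |||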
| 18857 | return lowerV8I16GeneralSingleInputShuffle( | |||
| 18858 | DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG); | |||
| 18859 | } | |||
| 18860 | } | |||
| 18861 | ||||
| 18862 | if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v16i16, Mask, V1, V2, | |||
| 18863 | Zeroable, Subtarget, DAG)) | |||
| 18864 | return PSHUFB; | |||
| 18865 | ||||
| 18866 | // AVX512BW can lower to VPERMW (non-VLX will pad to v32i16). | |||
| 18867 | if (Subtarget.hasBWI()) | |||
| 18868 | return lowerShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, Subtarget, DAG); | |||
| 18869 | ||||
| 18870 | // Try to simplify this by merging 128-bit lanes to enable a lane-based | |||
| 18871 | // shuffle. | |||
| 18872 | if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( | |||
| 18873 | DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG)) | |||
| 18874 | return Result; | |||
| 18875 | ||||
| 18876 | // Try to permute the lanes and then use a per-lane permute. | |||
| 18877 | if (SDValue V = lowerShuffleAsLanePermuteAndPermute( | |||
| 18878 | DL, MVT::v16i16, V1, V2, Mask, DAG, Subtarget)) | |||
| 18879 | return V; | |||
| 18880 | ||||
| 18881 | // Try to match an interleave of two v16i16s and lower them as unpck and | |||
| 18882 | // permutes using ymms. | |||
| 18883 | if (!Subtarget.hasAVX512()) | |||
| 18884 | if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v16i16, V1, V2, | |||
| 18885 | Mask, DAG)) | |||
| 18886 | return V; | |||
| 18887 | ||||
| 18888 | // Otherwise fall back on generic lowering. | |||
| 18889 | return lowerShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, | |||
| 18890 | Subtarget, DAG); | |||
| 18891 | } | |||
| 18892 | ||||
| 18893 | /// Handle lowering of 32-lane 8-bit integer shuffles. | |||
| 18894 | /// | |||
| 18895 | /// This routine is only called when we have AVX2 and thus a reasonable | |||
| 18896 | /// instruction set for v32i8 shuffling. | |||
| 18897 | static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 18898 | const APInt &Zeroable, SDValue V1, SDValue V2, | |||
| 18899 | const X86Subtarget &Subtarget, | |||
| 18900 | SelectionDAG &DAG) { | |||
| 18901 | assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); | |||
| 18902 | assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!"); | |||
| 18903 | assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); | |||
| 18904 | assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!"); | |||
| 18905 | ||||
| 18906 | // Whenever we can lower this as a zext, that instruction is strictly faster | |||
| 18907 | // than any alternative. It also allows us to fold memory operands into the | |||
| 18908 | // shuffle in many cases. | |||
| 18909 | if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend(DL, MVT::v32i8, V1, V2, Mask, | |||
| 18910 | Zeroable, Subtarget, DAG)) | |||
| 18911 | return ZExt; | |||
| 18912 | ||||
| 18913 | // Check for being able to broadcast a single element. | |||
| 18914 | if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, MVT::v32i8, V1, V2, Mask, | |||
| 18915 | Subtarget, DAG)) | |||
| 18916 | return Broadcast; | |||
| 18917 | ||||
| 18918 | if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask, | |||
| 18919 | Zeroable, Subtarget, DAG)) | |||
| 18920 | return Blend; | |||
| 18921 | ||||
| 18922 | // Use dedicated unpack instructions for masks that match their pattern. | |||
| 18923 | if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) | |||
| 18924 | return V; | |||
| 18925 | ||||
| 18926 | // Use dedicated pack instructions for masks that match their pattern. | |||
| 18927 | if (SDValue V = lowerShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG, | |||
| 18928 | Subtarget)) | |||
| 18929 | return V; | |||
| 18930 | ||||
| 18931 | // Try to lower using a truncation. | |||
| 18932 | if (SDValue V = lowerShuffleAsVTRUNC(DL, MVT::v32i8, V1, V2, Mask, Zeroable, | |||
| 18933 | Subtarget, DAG)) | |||
| 18934 | return V; | |||
| 18935 | ||||
| 18936 | // Try to use shift instructions. | |||
| 18937 | if (SDValue Shift = | |||
| 18938 | lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, | |||
| 18939 | DAG, /*BitwiseOnly*/ false)) | |||
| 18940 | return Shift; | |||
| 18941 | ||||
| 18942 | // Try to use byte rotation instructions. | |||
| 18943 | if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i8, V1, V2, Mask, | |||
| 18944 | Subtarget, DAG)) | |||
| 18945 | return Rotate; | |||
| 18946 | ||||
| 18947 | // Try to use bit rotation instructions. | |||
| 18948 | if (V2.isUndef()) | |||
| 18949 | if (SDValue Rotate = | |||
| 18950 | lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG)) | |||
| 18951 | return Rotate; | |||
| 18952 | ||||
| 18953 | // Try to create an in-lane repeating shuffle mask and then shuffle the | |||
| 18954 | // results into the target lanes. | |||
| 18955 | if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( | |||
| 18956 | DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) | |||
| 18957 | return V; | |||
| 18958 | ||||
| 18959 | // There are no generalized cross-lane shuffle operations available on i8 | |||
| 18960 | // element types. | |||
| 18961 | if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) { | |||
| 18962 | // Try to produce a fixed cross-128-bit lane permute followed by unpack | |||
| 18963 | // because that should be faster than the variable permute alternatives. | |||
| 18964 | if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG)) | |||
| 18965 | return V; | |||
| 18966 | ||||
| 18967 | if (SDValue V = lowerShuffleAsLanePermuteAndPermute( | |||
| 18968 | DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) | |||
| 18969 | return V; | |||
| 18970 | ||||
| 18971 | return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v32i8, V1, V2, Mask, | |||
| 18972 | DAG, Subtarget); | |||
| 18973 | } | |||
| 18974 | ||||
| 18975 | if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i8, Mask, V1, V2, | |||
| 18976 | Zeroable, Subtarget, DAG)) | |||
| 18977 | return PSHUFB; | |||
| 18978 | ||||
| 18979 | // AVX512VBMI can lower to VPERMB (non-VLX will pad to v64i8). | |||
| 18980 | if (Subtarget.hasVBMI()) | |||
| 18981 | return lowerShuffleWithPERMV(DL, MVT::v32i8, Mask, V1, V2, Subtarget, DAG); | |||
| 18982 | ||||
| 18983 | // Try to simplify this by merging 128-bit lanes to enable a lane-based | |||
| 18984 | // shuffle. | |||
| 18985 | if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( | |||
| 18986 | DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG)) | |||
| 18987 | return Result; | |||
| 18988 | ||||
| 18989 | // Try to permute the lanes and then use a per-lane permute. | |||
| 18990 | if (SDValue V = lowerShuffleAsLanePermuteAndPermute( | |||
| 18991 | DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) | |||
| 18992 | return V; | |||
| 18993 | ||||
| 18994 | // Look for {0, 8, 16, 24, 32, 40, 48, 56} in the first 8 elements, followed | |||
| 18995 | // by zeroable elements in the remaining 24 elements. Turn this into two | |||
| 18996 | // vmovqb instructions shuffled together. | |||
| 18997 | if (Subtarget.hasVLX()) | |||
| 18998 | if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2, | |||
| 18999 | Mask, Zeroable, DAG)) | |||
| 19000 | return V; | |||
| 19001 | ||||
| 19002 | // Try to match an interleave of two v32i8s and lower them as unpck and | |||
| 19003 | // permutes using ymms. | |||
| 19004 | if (!Subtarget.hasAVX512()) | |||
| 19005 | if (SDValue V = lowerShufflePairAsUNPCKAndPermute(DL, MVT::v32i8, V1, V2, | |||
| 19006 | Mask, DAG)) | |||
| 19007 | return V; | |||
| 19008 | ||||
| 19009 | // Otherwise fall back on generic lowering. | |||
| 19010 | return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, | |||
| 19011 | Subtarget, DAG); | |||
| 19012 | } | |||
| 19013 | ||||
| 19014 | /// High-level routine to lower various 256-bit x86 vector shuffles. | |||
| 19015 | /// | |||
| 19016 | /// This routine either breaks down the specific type of a 256-bit x86 vector | |||
| 19017 | /// shuffle or splits it into two 128-bit shuffles and fuses the results back | |||
| 19018 | /// together based on the available instructions. | |||
| 19019 | static SDValue lower256BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, MVT VT, | |||
| 19020 | SDValue V1, SDValue V2, const APInt &Zeroable, | |||
| 19021 | const X86Subtarget &Subtarget, | |||
| 19022 | SelectionDAG &DAG) { | |||
| 19023 | // If we have a single input to the zero element, insert that into V1 if we | |||
| 19024 | // can do so cheaply. | |||
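| | // Illustrative example (not from the original source): for v8f32, | |||
| | // NumElts == 8, so a mask like {8, 1, 2, 3, 4, 5, 6, 7} has exactly one V2 | |||
| | // element, and it lands in lane 0, so the shuffle may be lowered as an | |||
| | // insertion of V2's element 0 into V1. | |||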
| 19025 | int NumElts = VT.getVectorNumElements(); | |||
| 19026 | int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); | |||
| 19027 | ||||
| 19028 | if (NumV2Elements == 1 && Mask[0] >= NumElts) | |||
| 19029 | if (SDValue Insertion = lowerShuffleAsElementInsertion( | |||
| 19030 | DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) | |||
| 19031 | return Insertion; | |||
| 19032 | ||||
| 19033 | // Handle special cases where the lower or upper half is UNDEF. | |||
| 19034 | if (SDValue V = | |||
| 19035 | lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) | |||
| 19036 | return V; | |||
| 19037 | ||||
| 19038 | // There is a really nice hard cut-over between AVX1 and AVX2 that means we | |||
| 19039 | // can check for those subtargets here and avoid much of the subtarget | |||
| 19040 | // querying in the per-vector-type lowering routines. With AVX1 we have | |||
| 19041 | // essentially *zero* ability to manipulate a 256-bit vector with integer | |||
| 19042 | // types. Since we'll use floating point types there eventually, just | |||
| 19043 | // immediately cast everything to a float and operate entirely in that domain. | |||
| 19044 | if (VT.isInteger() && !Subtarget.hasAVX2()) { | |||
| 19045 | int ElementBits = VT.getScalarSizeInBits(); | |||
| 19046 | if (ElementBits < 32) { | |||
| 19047 | // No floating point type is available; if we can't use bit operations | |||
| 19048 | // for masking/blending, then decompose into 128-bit vectors. | |||
| 19049 | if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, | |||
| 19050 | Subtarget, DAG)) | |||
| 19051 | return V; | |||
| 19052 | if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) | |||
| 19053 | return V; | |||
| 19054 | return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false); | |||
| 19055 | } | |||
| 19056 | ||||
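| | // Illustrative example (not from the original source): on AVX1, a v8i32 | |||
| | // shuffle is bitcast to v8f32 here and lowered with the floating-point | |||
| | // shuffle instructions such as VSHUFPS/VPERMILPS, since no 256-bit integer | |||
| | // shuffles exist before AVX2. | |||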
| 19057 | MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits), | |||
| 19058 | VT.getVectorNumElements()); | |||
| 19059 | V1 = DAG.getBitcast(FpVT, V1); | |||
| 19060 | V2 = DAG.getBitcast(FpVT, V2); | |||
| 19061 | return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask)); | |||
| 19062 | } | |||
| 19063 | ||||
| 19064 | if (VT == MVT::v16f16) { | |||
| 19065 | V1 = DAG.getBitcast(MVT::v16i16, V1); | |||
| 19066 | V2 = DAG.getBitcast(MVT::v16i16, V2); | |||
| 19067 | return DAG.getBitcast(MVT::v16f16, | |||
| 19068 | DAG.getVectorShuffle(MVT::v16i16, DL, V1, V2, Mask)); | |||
| 19069 | } | |||
| 19070 | ||||
| 19071 | switch (VT.SimpleTy) { | |||
| 19072 | case MVT::v4f64: | |||
| 19073 | return lowerV4F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); | |||
| 19074 | case MVT::v4i64: | |||
| 19075 | return lowerV4I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); | |||
| 19076 | case MVT::v8f32: | |||
| 19077 | return lowerV8F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); | |||
| 19078 | case MVT::v8i32: | |||
| 19079 | return lowerV8I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); | |||
| 19080 | case MVT::v16i16: | |||
| 19081 | return lowerV16I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); | |||
| 19082 | case MVT::v32i8: | |||
| 19083 | return lowerV32I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); | |||
| 19084 | ||||
| 19085 | default: | |||
| 19086 | llvm_unreachable("Not a valid 256-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 256-bit x86 vector type!" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 19086); | |||
| 19087 | } | |||
| 19088 | } | |||
| 19089 | ||||
| 19090 | /// Try to lower a vector shuffle as a shuffle of 128-bit subvectors. | |||
| 19091 | static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, | |||
| 19092 | const APInt &Zeroable, SDValue V1, SDValue V2, | |||
| 19093 | const X86Subtarget &Subtarget, | |||
| 19094 | SelectionDAG &DAG) { | |||
| 19095 | assert(VT.getScalarSizeInBits() == 64 && | |||
| 19096 | "Unexpected element type size for 128bit shuffle."); | |||
| 19097 | ||||
| 19098 | // Handling a 256-bit vector requires VLX; for that case the function | |||
| 19099 | // lowerV2X128VectorShuffle() is most probably a better solution. | |||
| 19100 | assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle."); | |||
| 19101 | ||||
| 19102 | // TODO - use Zeroable like we do for lowerV2X128VectorShuffle? | |||
| 19103 | SmallVector<int, 4> Widened128Mask; | |||
| 19104 | if (!canWidenShuffleElements(Mask, Widened128Mask)) | |||
| 19105 | return SDValue(); | |||
| 19106 | assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch"); | |||
| 19107 | ||||
| 19108 | // Try to use an insert into a zero vector. | |||
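| | // Illustrative example (not from the original source): with v8i64, | |||
| | // Zeroable == 0xfc means elements 2..7 are zeroable; if the widened mask | |||
| | // starts with lane 0 of V1, the whole shuffle is "low 128 bits of V1 | |||
| | // inserted into a zero vector", i.e. NumElts == 2 below. | |||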
| 19109 | if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 && | |||
| 19110 | (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) { | |||
| 19111 | unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4; | |||
| 19112 | MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); | |||
| 19113 | SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1, | |||
| 19114 | DAG.getIntPtrConstant(0, DL)); | |||
| 19115 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, | |||
| 19116 | getZeroVector(VT, Subtarget, DAG, DL), LoV, | |||
| 19117 | DAG.getIntPtrConstant(0, DL)); | |||
| 19118 | } | |||
| 19119 | ||||
| 19120 | // Check for patterns which can be matched with a single insert of a 256-bit | |||
| 19121 | // subvector. | |||
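| | // Illustrative example (not from the original source): a mask of | |||
| | // {0, 1, 2, 3, 8, 9, 10, 11} keeps the low 256 bits of V1 and inserts the | |||
| | // low 256 bits of V2 into the upper half, which matches VINSERTF64x4 / | |||
| | // VINSERTI64x4 with an element index of 4. | |||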
| 19122 | bool OnlyUsesV1 = isShuffleEquivalent(Mask, {0, 1, 2, 3, 0, 1, 2, 3}, V1, V2); | |||
| 19123 | if (OnlyUsesV1 || | |||
| 19124 | isShuffleEquivalent(Mask, {0, 1, 2, 3, 8, 9, 10, 11}, V1, V2)) { | |||
| 19125 | MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4); | |||
| 19126 | SDValue SubVec = | |||
| 19127 | DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2, | |||
| 19128 | DAG.getIntPtrConstant(0, DL)); | |||
| 19129 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec, | |||
| 19130 | DAG.getIntPtrConstant(4, DL)); | |||
| 19131 | } | |||
| 19132 | ||||
| 19133 | // See if this is an insertion of the lower 128-bits of V2 into V1. | |||
| 19134 | bool IsInsert = true; | |||
| 19135 | int V2Index = -1; | |||
| 19136 | for (int i = 0; i < 4; ++i) { | |||
| 19137 | assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value"); | |||
| 19138 | if (Widened128Mask[i] < 0) | |||
| 19139 | continue; | |||
| 19140 | ||||
| 19141 | // Make sure all V1 subvectors are in place. | |||
| 19142 | if (Widened128Mask[i] < 4) { | |||
| 19143 | if (Widened128Mask[i] != i) { | |||
| 19144 | IsInsert = false; | |||
| 19145 | break; | |||
| 19146 | } | |||
| 19147 | } else { | |||
| 19148 | // Make sure we only have a single V2 index and it's the lowest 128-bits. | |||
| 19149 | if (V2Index >= 0 || Widened128Mask[i] != 4) { | |||
| 19150 | IsInsert = false; | |||
| 19151 | break; | |||
| 19152 | } | |||
| 19153 | V2Index = i; | |||
| 19154 | } | |||
| 19155 | } | |||
| 19156 | if (IsInsert && V2Index >= 0) { | |||
| 19157 | MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2); | |||
| 19158 | SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2, | |||
| 19159 | DAG.getIntPtrConstant(0, DL)); | |||
| 19160 | return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL); | |||
| 19161 | } | |||
| 19162 | ||||
| 19163 | // See if we can widen to a 256-bit lane shuffle; we're going to lose 128-bit | |||
| 19164 | // lane UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where | |||
| 19165 | // possible we at least ensure the lanes stay sequential to help later | |||
| 19166 | // combines. | |||
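| | // Illustrative example (not from the original source): a widened mask of | |||
| | // {2, U, U, U} widens again to the 256-bit mask {1, U} and re-narrows to | |||
| | // {2, 3, U, U}, replacing undefs with sequential lanes. | |||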
| 19167 | SmallVector<int, 2> Widened256Mask; | |||
| 19168 | if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) { | |||
| 19169 | Widened128Mask.clear(); | |||
| 19170 | narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask); | |||
| 19171 | } | |||
| 19172 | ||||
| 19173 | // Try to lower to vshuf64x2/vshuf32x4. | |||
| 19174 | SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)}; | |||
| 19175 | unsigned PermMask = 0; | |||
| 19176 | // Ensure elements came from the same Op. | |||
| 19177 | for (int i = 0; i < 4; ++i) { | |||
| 19178 | assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value"); | |||
| 19179 | if (Widened128Mask[i] < 0) | |||
| 19180 | continue; | |||
| 19181 | ||||
| 19182 | SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1; | |||
| 19183 | unsigned OpIndex = i / 2; | |||
| 19184 | if (Ops[OpIndex].isUndef()) | |||
| 19185 | Ops[OpIndex] = Op; | |||
| 19186 | else if (Ops[OpIndex] != Op) | |||
| 19187 | return SDValue(); | |||
| 19188 | ||||
| 19189 | // Convert the 128-bit shuffle mask selection values into 128-bit selection | |||
| 19190 | // bits defined by a vshuf64x2 instruction's immediate control byte. | |||
| 19191 | PermMask |= (Widened128Mask[i] % 4) << (i * 2); | |||
| 19192 | } | |||
| 19193 | ||||
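| | // Illustrative example (not from the original source): for a widened mask | |||
| | // of {0, 1, 4, 5}, lanes 0..1 come from V1 and lanes 2..3 from V2, giving | |||
| | // PermMask == (0 << 0) | (1 << 2) | (0 << 4) | (1 << 6) == 0x44. | |||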
| 19194 | return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1], | |||
| 19195 | DAG.getTargetConstant(PermMask, DL, MVT::i8)); | |||
| 19196 | } | |||
| 19197 | ||||
| 19198 | /// Handle lowering of 8-lane 64-bit floating point shuffles. | |||
| 19199 | static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 19200 | const APInt &Zeroable, SDValue V1, SDValue V2, | |||
| 19201 | const X86Subtarget &Subtarget, | |||
| 19202 | SelectionDAG &DAG) { | |||
| 19203 | assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); | |||
| 19204 | assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!"); | |||
| 19205 | assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); | |||
| 19206 | ||||
| 19207 | if (V2.isUndef()) { | |||
| 19208 | // Use low duplicate instructions for masks that match their pattern. | |||
| 19209 | if (isShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6}, V1, V2)) | |||
| 19210 | return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1); | |||
| 19211 | ||||
| 19212 | if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) { | |||
| 19213 | // Non-half-crossing single input shuffles can be lowered with an | |||
| 19214 | // interleaved permutation. | |||
| 19215 | unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) | | |||
| 19216 | ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) | | |||
| 19217 | ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) | | |||
| 19218 | ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7); | |||
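| | // Illustrative example (not from the original source): each immediate bit | |||
| | // selects the odd element of a 64-bit pair, so the mask | |||
| | // {1, 0, 3, 2, 5, 4, 7, 6} sets bits 0, 2, 4 and 6, i.e. imm == 0x55. | |||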
| 19219 | return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1, | |||
| 19220 | DAG.getTargetConstant(VPERMILPMask, DL, MVT::i8)); | |||
| 19221 | } | |||
| 19222 | ||||
| 19223 | SmallVector<int, 4> RepeatedMask; | |||
| 19224 | if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) | |||
| 19225 | return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1, | |||
| 19226 | getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); | |||
| 19227 | } | |||
| 19228 | ||||
| 19229 | if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8f64, Mask, Zeroable, V1, | |||
| 19230 | V2, Subtarget, DAG)) | |||
| 19231 | return Shuf128; | |||
| 19232 | ||||
| 19233 | if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG)) | |||
| 19234 | return Unpck; | |||
| 19235 | ||||
| 19236 | // Check if the blend happens to exactly fit the pattern of SHUFPD. | |||
| 19237 | if (SDValue Op = lowerShuffleWithSHUFPD(DL, MVT::v8f64, V1, V2, Mask, | |||
| 19238 | Zeroable, Subtarget, DAG)) | |||
| 19239 | return Op; | |||
| 19240 | ||||
| 19241 | if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2, | |||
| 19242 | DAG, Subtarget)) | |||
| 19243 | return V; | |||
| 19244 | ||||
| 19245 | if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask, | |||
| 19246 | Zeroable, Subtarget, DAG)) | |||
| 19247 | return Blend; | |||
| 19248 | ||||
| 19249 | return lowerShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, Subtarget, DAG); | |||
| 19250 | } | |||
| 19251 | ||||
| 19252 | /// Handle lowering of 16-lane 32-bit floating point shuffles. | |||
| 19253 | static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 19254 | const APInt &Zeroable, SDValue V1, SDValue V2, | |||
| 19255 | const X86Subtarget &Subtarget, | |||
| 19256 | SelectionDAG &DAG) { | |||
| 19257 | assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); | |||
| 19258 | assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!"); | |||
| 19259 | assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); | |||
| 19260 | ||||
| 19261 | // If the shuffle mask is repeated in each 128-bit lane, we have many more | |||
| 19262 | // options to efficiently lower the shuffle. | |||
| 19263 | SmallVector<int, 4> RepeatedMask; | |||
| 19264 | if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) { | |||
| 19265 | assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); | |||
| 19266 | ||||
| 19267 | // Use even/odd duplicate instructions for masks that match their pattern. | |||
| 19268 | if (isShuffleEquivalent(RepeatedMask, {0, 0, 2, 2}, V1, V2)) | |||
| 19269 | return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1); | |||
| 19270 | if (isShuffleEquivalent(RepeatedMask, {1, 1, 3, 3}, V1, V2)) | |||
| 19271 | return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1); | |||
| 19272 | ||||
| 19273 | if (V2.isUndef()) | |||
| 19274 | return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1, | |||
| 19275 | getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); | |||
| 19276 | ||||
| 19277 | // Use dedicated unpack instructions for masks that match their pattern. | |||
| 19278 | if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG)) | |||
| 19279 | return V; | |||
| 19280 | ||||
| 19281 | if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, | |||
| 19282 | Zeroable, Subtarget, DAG)) | |||
| 19283 | return Blend; | |||
| 19284 | ||||
| 19285 | // Otherwise, fall back to a SHUFPS sequence. | |||
| 19286 | return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); | |||
| 19287 | } | |||
| 19288 | ||||
| 19289 | if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask, | |||
| 19290 | Zeroable, Subtarget, DAG)) | |||
| 19291 | return Blend; | |||
| 19292 | ||||
| 19293 | if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( | |||
| 19294 | DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) | |||
| 19295 | return DAG.getBitcast(MVT::v16f32, ZExt); | |||
| 19296 | ||||
| 19297 | // Try to create an in-lane repeating shuffle mask and then shuffle the | |||
| 19298 | // results into the target lanes. | |||
| 19299 | if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( | |||
| 19300 | DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG)) | |||
| 19301 | return V; | |||
| 19302 | ||||
| 19303 | // If we have a single input shuffle with different shuffle patterns in the | |||
| 19304 | // 128-bit lanes and don't lane cross, use variable mask VPERMILPS. | |||
| 19305 | if (V2.isUndef() && | |||
| 19306 | !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) { | |||
| 19307 | SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true); | |||
| 19308 | return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask); | |||
| 19309 | } | |||
| 19310 | ||||
| 19311 | // If we have AVX512F support, we can use VEXPAND. | |||
| 19312 | if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, | |||
| 19313 | V1, V2, DAG, Subtarget)) | |||
| 19314 | return V; | |||
| 19315 | ||||
| 19316 | return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG); | |||
| 19317 | } | |||
| 19318 | ||||
| 19319 | /// Handle lowering of 8-lane 64-bit integer shuffles. | |||
| 19320 | static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 19321 | const APInt &Zeroable, SDValue V1, SDValue V2, | |||
| 19322 | const X86Subtarget &Subtarget, | |||
| 19323 | SelectionDAG &DAG) { | |||
| 19324 | assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); | |||
| 19325 | assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); | |||
| 19326 | assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); | |||
| 19327 | ||||
| 19328 | // Try to use shift instructions if fast. | |||
| 19329 | if (Subtarget.preferLowerShuffleAsShift()) | |||
| 19330 | if (SDValue Shift = | |||
| 19331 | lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, | |||
| 19332 | Subtarget, DAG, /*BitwiseOnly*/ true)) | |||
| 19333 | return Shift; | |||
| 19334 | ||||
| 19335 | if (V2.isUndef()) { | |||
| 19336 | // When the shuffle is mirrored between the 128-bit lanes of the vector, we | |||
| 19337 | // can use lower-latency instructions that will operate on all four | |||
| 19338 | // 128-bit lanes. | |||
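| | // Illustrative example (not from the original source): a repeated v2i64 | |||
| | // mask of {1, 0} narrows to the v4i32 PSHUFD mask {2, 3, 0, 1}, swapping | |||
| | // the i64 halves of every 128-bit lane. | |||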
| 19339 | SmallVector<int, 2> Repeated128Mask; | |||
| 19340 | if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) { | |||
| 19341 | SmallVector<int, 4> PSHUFDMask; | |||
| 19342 | narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask); | |||
| 19343 | return DAG.getBitcast( | |||
| 19344 | MVT::v8i64, | |||
| 19345 | DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, | |||
| 19346 | DAG.getBitcast(MVT::v16i32, V1), | |||
| 19347 | getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG))); | |||
| 19348 | } | |||
| 19349 | ||||
| 19350 | SmallVector<int, 4> Repeated256Mask; | |||
| 19351 | if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask)) | |||
| 19352 | return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1, | |||
| 19353 | getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG)); | |||
| 19354 | } | |||
| 19355 | ||||
| 19356 | if (SDValue Shuf128 = lowerV4X128Shuffle(DL, MVT::v8i64, Mask, Zeroable, V1, | |||
| 19357 | V2, Subtarget, DAG)) | |||
| 19358 | return Shuf128; | |||
| 19359 | ||||
| 19360 | // Try to use shift instructions. | |||
| 19361 | if (SDValue Shift = | |||
| 19362 | lowerShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget, | |||
| 19363 | DAG, /*BitwiseOnly*/ false)) | |||
| 19364 | return Shift; | |||
| 19365 | ||||
| 19366 | // Try to use VALIGN. | |||
| 19367 | if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask, | |||
| 19368 | Subtarget, DAG)) | |||
| 19369 | return Rotate; | |||
| 19370 | ||||
| 19371 | // Try to use PALIGNR. | |||
| 19372 | if (Subtarget.hasBWI()) | |||
| 19373 | if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v8i64, V1, V2, Mask, | |||
| 19374 | Subtarget, DAG)) | |||
| 19375 | return Rotate; | |||
| 19376 | ||||
| 19377 | if (SDValue Unpck = lowerShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG)) | |||
| 19378 | return Unpck; | |||
| 19379 | ||||
| 19380 | // If we have AVX512F support, we can use VEXPAND. | |||
| 19381 | if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2, | |||
| 19382 | DAG, Subtarget)) | |||
| 19383 | return V; | |||
| 19384 | ||||
| 19385 | if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask, | |||
| 19386 | Zeroable, Subtarget, DAG)) | |||
| 19387 | return Blend; | |||
| 19388 | ||||
| 19389 | return lowerShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, Subtarget, DAG); | |||
| 19390 | } | |||
| 19391 | ||||
| 19392 | /// Handle lowering of 16-lane 32-bit integer shuffles. | |||
| 19393 | static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 19394 | const APInt &Zeroable, SDValue V1, SDValue V2, | |||
| 19395 | const X86Subtarget &Subtarget, | |||
| 19396 | SelectionDAG &DAG) { | |||
| 19397 | assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); | |||
| 19398 | assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!"); | |||
| 19399 | assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!"); | |||
| 19400 | ||||
| 19401 | int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; }); | |||
| 19402 | ||||
| 19403 | // Whenever we can lower this as a zext, that instruction is strictly faster | |||
| 19404 | // than any alternative. It also allows us to fold memory operands into the | |||
| 19405 | // shuffle in many cases. | |||
| 19406 | if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( | |||
| 19407 | DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG)) | |||
| 19408 | return ZExt; | |||
| 19409 | ||||
| 19410 | // Try to use shift instructions if fast. | |||
| 19411 | if (Subtarget.preferLowerShuffleAsShift()) { | |||
| 19412 | if (SDValue Shift = | |||
| 19413 | lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable, | |||
| 19414 | Subtarget, DAG, /*BitwiseOnly*/ true)) | |||
| 19415 | return Shift; | |||
| 19416 | if (NumV2Elements == 0) | |||
| 19417 | if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, | |||
| 19418 | Subtarget, DAG)) | |||
| 19419 | return Rotate; | |||
| 19420 | } | |||
| 19421 | ||||
| 19422 | // If the shuffle mask is repeated in each 128-bit lane we can use more | |||
| 19423 | // efficient instructions that mirror the shuffles across the four 128-bit | |||
| 19424 | // lanes. | |||
| 19425 | SmallVector<int, 4> RepeatedMask; | |||
| 19426 | bool Is128BitLaneRepeatedShuffle = | |||
| 19427 | is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask); | |||
| 19428 | if (Is128BitLaneRepeatedShuffle) { | |||
| 19429 | assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!"); | |||
| 19430 | if (V2.isUndef()) | |||
| 19431 | return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1, | |||
| 19432 | getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG)); | |||
| 19433 | ||||
| 19434 | // Use dedicated unpack instructions for masks that match their pattern. | |||
| 19435 | if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG)) | |||
| 19436 | return V; | |||
| 19437 | } | |||
| 19438 | ||||
| 19439 | // Try to use shift instructions. | |||
| 19440 | if (SDValue Shift = | |||
| 19441 | lowerShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask, Zeroable, | |||
| 19442 | Subtarget, DAG, /*BitwiseOnly*/ false)) | |||
| 19443 | return Shift; | |||
| 19444 | ||||
| 19445 | if (!Subtarget.preferLowerShuffleAsShift() && NumV2Elements != 0) | |||
| 19446 | if (SDValue Rotate = | |||
| 19447 | lowerShuffleAsBitRotate(DL, MVT::v16i32, V1, Mask, Subtarget, DAG)) | |||
| 19448 | return Rotate; | |||
| 19449 | ||||
| 19450 | // Try to use VALIGN. | |||
| 19451 | if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask, | |||
| 19452 | Subtarget, DAG)) | |||
| 19453 | return Rotate; | |||
| 19454 | ||||
| 19455 | // Try to use byte rotation instructions. | |||
| 19456 | if (Subtarget.hasBWI()) | |||
| 19457 | if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v16i32, V1, V2, Mask, | |||
| 19458 | Subtarget, DAG)) | |||
| 19459 | return Rotate; | |||
| 19460 | ||||
| 19461 | // Assume that a single SHUFPS is faster than using a permv shuffle. | |||
| 19462 | // If some CPU is harmed by the domain switch, we can fix it in a later pass. | |||
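| | // Illustrative example (not from the original source): a repeated mask of | |||
| | // {0, 1, 4, 5} takes its low pair from V1 and its high pair from V2 in | |||
| | // every lane, which SHUFPS encodes directly, avoiding a VPERMT2D plus a | |||
| | // loaded index vector. | |||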
| 19463 | if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) { | |||
| 19464 | SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1); | |||
| 19465 | SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2); | |||
| 19466 | SDValue ShufPS = lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, | |||
| 19467 | CastV1, CastV2, DAG); | |||
| 19468 | return DAG.getBitcast(MVT::v16i32, ShufPS); | |||
| 19469 | } | |||
| 19470 | ||||
| 19471 | // Try to create an in-lane repeating shuffle mask and then shuffle the | |||
| 19472 | // results into the target lanes. | |||
| 19473 | if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( | |||
| 19474 | DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG)) | |||
| 19475 | return V; | |||
| 19476 | ||||
| 19477 | // If we have AVX512F support, we can use VEXPAND. | |||
| 19478 | if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2, | |||
| 19479 | DAG, Subtarget)) | |||
| 19480 | return V; | |||
| 19481 | ||||
| 19482 | if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask, | |||
| 19483 | Zeroable, Subtarget, DAG)) | |||
| 19484 | return Blend; | |||
| 19485 | ||||
| 19486 | return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, Subtarget, DAG); | |||
| 19487 | } | |||
| 19488 | ||||
| 19489 | /// Handle lowering of 32-lane 16-bit integer shuffles. | |||
| 19490 | static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 19491 | const APInt &Zeroable, SDValue V1, SDValue V2, | |||
| 19492 | const X86Subtarget &Subtarget, | |||
| 19493 | SelectionDAG &DAG) { | |||
| 19494 | assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); | |||
| 19495 | assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!"); | |||
| 19496 | assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!"); | |||
| 19497 | assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!"); | |||
| 19498 | ||||
| 19499 | // Whenever we can lower this as a zext, that instruction is strictly faster | |||
| 19500 | // than any alternative. It also allows us to fold memory operands into the | |||
| 19501 | // shuffle in many cases. | |||
| 19502 | if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( | |||
| 19503 | DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) | |||
| 19504 | return ZExt; | |||
| 19505 | ||||
| 19506 | // Use dedicated unpack instructions for masks that match their pattern. | |||
| 19507 | if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG)) | |||
| 19508 | return V; | |||
| 19509 | ||||
| 19510 | // Use dedicated pack instructions for masks that match their pattern. | |||
| 19511 | if (SDValue V = | |||
| 19512 | lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget)) | |||
| 19513 | return V; | |||
| 19514 | ||||
| 19515 | // Try to use shift instructions. | |||
| 19516 | if (SDValue Shift = | |||
| 19517 | lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask, Zeroable, | |||
| 19518 | Subtarget, DAG, /*BitwiseOnly*/ false)) | |||
| 19519 | return Shift; | |||
| 19520 | ||||
| 19521 | // Try to use byte rotation instructions. | |||
| 19522 | if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v32i16, V1, V2, Mask, | |||
| 19523 | Subtarget, DAG)) | |||
| 19524 | return Rotate; | |||
| 19525 | ||||
| 19526 | if (V2.isUndef()) { | |||
| 19527 | // Try to use bit rotation instructions. | |||
| 19528 | if (SDValue Rotate = | |||
| 19529 | lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG)) | |||
| 19530 | return Rotate; | |||
| 19531 | ||||
| 19532 | SmallVector<int, 8> RepeatedMask; | |||
| 19533 | if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) { | |||
| 19534 | // As this is a single-input shuffle, the repeated mask should be | |||
| 19535 | // a strictly valid v8i16 mask that we can pass through to the v8i16 | |||
| 19536 | // lowering to handle even the v32 case. | |||
| 19537 | return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1, | |||
| 19538 | RepeatedMask, Subtarget, DAG); | |||
| 19539 | } | |||
| 19540 | } | |||
| 19541 | ||||
| 19542 | if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask, | |||
| 19543 | Zeroable, Subtarget, DAG)) | |||
| 19544 | return Blend; | |||
| 19545 | ||||
| 19546 | if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2, | |||
| 19547 | Zeroable, Subtarget, DAG)) | |||
| 19548 | return PSHUFB; | |||
| 19549 | ||||
| 19550 | return lowerShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, Subtarget, DAG); | |||
| 19551 | } | |||
| 19552 | ||||
| 19553 | /// Handle lowering of 64-lane 8-bit integer shuffles. | |||
| 19554 | static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 19555 | const APInt &Zeroable, SDValue V1, SDValue V2, | |||
| 19556 | const X86Subtarget &Subtarget, | |||
| 19557 | SelectionDAG &DAG) { | |||
| 19558 | assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); | |||
| 19559 | assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!"); | |||
| 19560 | assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!"); | |||
| 19561 | assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!"); | |||
| 19562 | ||||
| 19563 | // Whenever we can lower this as a zext, that instruction is strictly faster | |||
| 19564 | // than any alternative. It also allows us to fold memory operands into the | |||
| 19565 | // shuffle in many cases. | |||
| 19566 | if (SDValue ZExt = lowerShuffleAsZeroOrAnyExtend( | |||
| 19567 | DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) | |||
| 19568 | return ZExt; | |||
| 19569 | ||||
| 19570 | // Use dedicated unpack instructions for masks that match their pattern. | |||
| 19571 | if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG)) | |||
| 19572 | return V; | |||
| 19573 | ||||
| 19574 | // Use dedicated pack instructions for masks that match their pattern. | |||
| 19575 | if (SDValue V = lowerShuffleWithPACK(DL, MVT::v64i8, Mask, V1, V2, DAG, | |||
| 19576 | Subtarget)) | |||
| 19577 | return V; | |||
| 19578 | ||||
| 19579 | // Try to use shift instructions. | |||
| 19580 | if (SDValue Shift = | |||
| 19581 | lowerShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, | |||
| 19582 | DAG, /*BitwiseOnly*/ false)) | |||
| 19583 | return Shift; | |||
| 19584 | ||||
| 19585 | // Try to use byte rotation instructions. | |||
| 19586 | if (SDValue Rotate = lowerShuffleAsByteRotate(DL, MVT::v64i8, V1, V2, Mask, | |||
| 19587 | Subtarget, DAG)) | |||
| 19588 | return Rotate; | |||
| 19589 | ||||
| 19590 | // Try to use bit rotation instructions. | |||
| 19591 | if (V2.isUndef()) | |||
| 19592 | if (SDValue Rotate = | |||
| 19593 | lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG)) | |||
| 19594 | return Rotate; | |||
| 19595 | ||||
| 19596 | // Lower as AND if possible. | |||
| 19597 | if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask, | |||
| 19598 | Zeroable, Subtarget, DAG)) | |||
| 19599 | return Masked; | |||
| 19600 | ||||
| 19601 | if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2, | |||
| 19602 | Zeroable, Subtarget, DAG)) | |||
| 19603 | return PSHUFB; | |||
| 19604 | ||||
| 19605 | // Try to create an in-lane repeating shuffle mask and then shuffle the | |||
| 19606 | // results into the target lanes. | |||
| 19607 | if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute( | |||
| 19608 | DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) | |||
| 19609 | return V; | |||
| 19610 | ||||
| 19611 | if (SDValue Result = lowerShuffleAsLanePermuteAndPermute( | |||
| 19612 | DL, MVT::v64i8, V1, V2, Mask, DAG, Subtarget)) | |||
| 19613 | return Result; | |||
| 19614 | ||||
| 19615 | if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask, | |||
| 19616 | Zeroable, Subtarget, DAG)) | |||
| 19617 | return Blend; | |||
| 19618 | ||||
| 19619 | if (!is128BitLaneCrossingShuffleMask(MVT::v64i8, Mask)) { | |||
| 19620 | // Use PALIGNR+Permute if possible - permute might become PSHUFB but the | |||
| 19621 | // PALIGNR will be cheaper than the second PSHUFB+OR. | |||
| 19622 | if (SDValue V = lowerShuffleAsByteRotateAndPermute(DL, MVT::v64i8, V1, V2, | |||
| 19623 | Mask, Subtarget, DAG)) | |||
| 19624 | return V; | |||
| 19625 | ||||
| 19626 | // If we can't directly blend but can use PSHUFB, that will be better as it | |||
| 19627 | // can both shuffle and set up the inefficient blend. | |||
| 19628 | bool V1InUse, V2InUse; | |||
| 19629 | return lowerShuffleAsBlendOfPSHUFBs(DL, MVT::v64i8, V1, V2, Mask, Zeroable, | |||
| 19630 | DAG, V1InUse, V2InUse); | |||
| 19631 | } | |||
| 19632 | ||||
| 19633 | // Try to simplify this by merging 128-bit lanes to enable a lane-based | |||
| 19634 | // shuffle. | |||
| 19635 | if (!V2.isUndef()) | |||
| 19636 | if (SDValue Result = lowerShuffleAsLanePermuteAndRepeatedMask( | |||
| 19637 | DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG)) | |||
| 19638 | return Result; | |||
| 19639 | ||||
| 19640 | // VBMI can use VPERMV/VPERMV3 byte shuffles. | |||
| 19641 | if (Subtarget.hasVBMI()) | |||
| 19642 | return lowerShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, Subtarget, DAG); | |||
| 19643 | ||||
| 19644 | return splitAndLowerShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG, /*SimpleOnly*/ false); | |||
| 19645 | } | |||
| 19646 | ||||
| 19647 | /// High-level routine to lower various 512-bit x86 vector shuffles. | |||
| 19648 | /// | |||
| 19649 | /// This routine either breaks down the specific type of a 512-bit x86 vector | |||
| 19650 | /// shuffle or splits it into two 256-bit shuffles and fuses the results back | |||
| 19651 | /// together based on the available instructions. | |||
| 19652 | static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 19653 | MVT VT, SDValue V1, SDValue V2, | |||
| 19654 | const APInt &Zeroable, | |||
| 19655 | const X86Subtarget &Subtarget, | |||
| 19656 | SelectionDAG &DAG) { | |||
| 19657 | assert(Subtarget.hasAVX512() && | |||
| 19658 | "Cannot lower 512-bit vectors w/ basic ISA!"); | |||
| 19659 | ||||
| 19660 | // If we have a single input to the zero element, insert that into V1 if we | |||
| 19661 | // can do so cheaply. | |||
| 19662 | int NumElts = Mask.size(); | |||
| 19663 | int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; }); | |||
| 19664 | ||||
| 19665 | if (NumV2Elements == 1 && Mask[0] >= NumElts) | |||
| 19666 | if (SDValue Insertion = lowerShuffleAsElementInsertion( | |||
| 19667 | DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) | |||
| 19668 | return Insertion; | |||
| 19669 | ||||
| 19670 | // Handle special cases where the lower or upper half is UNDEF. | |||
| 19671 | if (SDValue V = | |||
| 19672 | lowerShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) | |||
| 19673 | return V; | |||
| 19674 | ||||
| 19675 | // Check for being able to broadcast a single element. | |||
| 19676 | if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, Mask, | |||
| 19677 | Subtarget, DAG)) | |||
| 19678 | return Broadcast; | |||
| 19679 | ||||
| 19680 | if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) { | |||
| 19681 | // Try using bit ops for masking and blending before falling back to | |||
| 19682 | // splitting. | |||
| 19683 | if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, | |||
| 19684 | Subtarget, DAG)) | |||
| 19685 | return V; | |||
| 19686 | if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG)) | |||
| 19687 | return V; | |||
| 19688 | ||||
| 19689 | return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG, /*SimpleOnly*/ false); | |||
| 19690 | } | |||
| 19691 | ||||
| 19692 | if (VT == MVT::v32f16) { | |||
| 19693 | V1 = DAG.getBitcast(MVT::v32i16, V1); | |||
| 19694 | V2 = DAG.getBitcast(MVT::v32i16, V2); | |||
| 19695 | return DAG.getBitcast(MVT::v32f16, | |||
| 19696 | DAG.getVectorShuffle(MVT::v32i16, DL, V1, V2, Mask)); | |||
| 19697 | } | |||
| 19698 | ||||
| 19699 | // Dispatch to each element type for lowering. If we don't have support for | |||
| 19700 | // specific element type shuffles at 512 bits, immediately split them and | |||
| 19701 | // lower them. Each lowering routine of a given type is allowed to assume that | |||
| 19702 | // the requisite ISA extensions for that element type are available. | |||
| 19703 | switch (VT.SimpleTy) { | |||
| 19704 | case MVT::v8f64: | |||
| 19705 | return lowerV8F64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); | |||
| 19706 | case MVT::v16f32: | |||
| 19707 | return lowerV16F32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); | |||
| 19708 | case MVT::v8i64: | |||
| 19709 | return lowerV8I64Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); | |||
| 19710 | case MVT::v16i32: | |||
| 19711 | return lowerV16I32Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); | |||
| 19712 | case MVT::v32i16: | |||
| 19713 | return lowerV32I16Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); | |||
| 19714 | case MVT::v64i8: | |||
| 19715 | return lowerV64I8Shuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG); | |||
| 19716 | ||||
| 19717 | default: | |||
| 19718 | llvm_unreachable("Not a valid 512-bit x86 vector type!")::llvm::llvm_unreachable_internal("Not a valid 512-bit x86 vector type!" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 19718); | |||
| 19719 | } | |||
| 19720 | } | |||
| 19721 | ||||
| 19722 | static SDValue lower1BitShuffleAsKSHIFTR(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 19723 | MVT VT, SDValue V1, SDValue V2, | |||
| 19724 | const X86Subtarget &Subtarget, | |||
| 19725 | SelectionDAG &DAG) { | |||
| 19726 | // Shuffle should be unary. | |||
| 19727 | if (!V2.isUndef()) | |||
| 19728 | return SDValue(); | |||
| 19729 | ||||
| 19730 | int ShiftAmt = -1; | |||
| 19731 | int NumElts = Mask.size(); | |||
| 19732 | for (int i = 0; i != NumElts; ++i) { | |||
| 19733 | int M = Mask[i]; | |||
| 19734 | assert((M == SM_SentinelUndef || (0 <= M && M < NumElts)) && | |||
| 19735 | "Unexpected mask index."); | |||
| 19736 | if (M < 0) | |||
| 19737 | continue; | |||
| 19738 | ||||
| 19739 | // The first non-undef element determines our shift amount. | |||
| 19740 | if (ShiftAmt < 0) { | |||
| 19741 | ShiftAmt = M - i; | |||
| 19742 | // Need to be shifting right. | |||
| 19743 | if (ShiftAmt <= 0) | |||
| 19744 | return SDValue(); | |||
| 19745 | } | |||
| 19746 | // All non-undef elements must shift by the same amount. | |||
| 19747 | if (ShiftAmt != M - i) | |||
| 19748 | return SDValue(); | |||
| 19749 | } | |||
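| | // Illustrative example (not from the original source): for a v8i1 mask of | |||
| | // {2, 3, 4, 5, 6, 7, U, U}, every defined element satisfies M - i == 2, | |||
| | // so ShiftAmt == 2 and the shuffle is a KSHIFTR by 2. | |||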
| 19750 | assert(ShiftAmt >= 0 && "All undef?")(static_cast <bool> (ShiftAmt >= 0 && "All undef?" ) ? void (0) : __assert_fail ("ShiftAmt >= 0 && \"All undef?\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 19750, __extension__ __PRETTY_FUNCTION__)); | |||
| 19751 | ||||
| 19752 | // Great, we found a shift right. | |||
| 19753 | MVT WideVT = VT; | |||
| 19754 | if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8) | |||
| 19755 | WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; | |||
| 19756 | SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, | |||
| 19757 | DAG.getUNDEF(WideVT), V1, | |||
| 19758 | DAG.getIntPtrConstant(0, DL)); | |||
| 19759 | Res = DAG.getNode(X86ISD::KSHIFTR, DL, WideVT, Res, | |||
| 19760 | DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); | |||
| 19761 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, | |||
| 19762 | DAG.getIntPtrConstant(0, DL)); | |||
| 19763 | } | |||
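
The shift detection above can be read in isolation: every defined mask lane must pull from a constant positive offset, and the first defined lane fixes that offset. A minimal standalone sketch of the same logic (not LLVM code; the plain-vector mask representation and the function name are illustrative):

#include <vector>

// Returns the uniform right-shift amount implied by Mask, or -1 when the
// mask is not a pure shift toward lane 0. -1 entries model undef lanes.
static int detectUniformRightShift(const std::vector<int> &Mask) {
  int ShiftAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; ++i) {
    if (Mask[i] < 0)
      continue; // an undef lane imposes no constraint
    if (ShiftAmt < 0) {
      ShiftAmt = Mask[i] - i; // the first defined lane fixes the amount
      if (ShiftAmt <= 0)
        return -1; // must move elements toward lane 0
    }
    if (ShiftAmt != Mask[i] - i)
      return -1; // lanes disagree on the shift amount
  }
  return ShiftAmt;
}

// e.g. detectUniformRightShift({2, 3, -1, -1}) == 2: a v4i1 KSHIFTR by 2.
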
| 19764 | ||||
| 19765 | // Determine if this shuffle can be implemented with a KSHIFT instruction. | |||
| 19766 | // Returns the shift amount if possible or -1 if not. This is a simplified | |||
| 19767 | // version of matchShuffleAsShift. | |||
| 19768 | static int match1BitShuffleAsKSHIFT(unsigned &Opcode, ArrayRef<int> Mask, | |||
| 19769 | int MaskOffset, const APInt &Zeroable) { | |||
| 19770 | int Size = Mask.size(); | |||
| 19771 | ||||
| 19772 | auto CheckZeros = [&](int Shift, bool Left) { | |||
| 19773 | for (int j = 0; j < Shift; ++j) | |||
| 19774 | if (!Zeroable[j + (Left ? 0 : (Size - Shift))]) | |||
| 19775 | return false; | |||
| 19776 | ||||
| 19777 | return true; | |||
| 19778 | }; | |||
| 19779 | ||||
| 19780 | auto MatchShift = [&](int Shift, bool Left) { | |||
| 19781 | unsigned Pos = Left ? Shift : 0; | |||
| 19782 | unsigned Low = Left ? 0 : Shift; | |||
| 19783 | unsigned Len = Size - Shift; | |||
| 19784 | return isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset); | |||
| 19785 | }; | |||
| 19786 | ||||
| 19787 | for (int Shift = 1; Shift != Size; ++Shift) | |||
| 19788 | for (bool Left : {true, false}) | |||
| 19789 | if (CheckZeros(Shift, Left) && MatchShift(Shift, Left)) { | |||
| 19790 | Opcode = Left ? X86ISD::KSHIFTL : X86ISD::KSHIFTR; | |||
| 19791 | return Shift; | |||
| 19792 | } | |||
| 19793 | ||||
| 19794 | return -1; | |||
| 19795 | } | |||
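
For intuition, the matcher above asks two questions for each candidate (Shift, Left) pair: are the lanes the shift fills actually allowed to be zero, and do the surviving lanes form a sequential run? A standalone model of that check, with the two-input MaskOffset handling dropped for brevity (hypothetical names, not LLVM code):

#include <vector>

// Left==true models KSHIFTL (zeroes enter at the low lanes); Left==false
// models KSHIFTR (zeroes enter at the high lanes).
static bool matchesKShift(const std::vector<int> &Mask,
                          const std::vector<bool> &Zeroable, int Shift,
                          bool Left) {
  int Size = (int)Mask.size();
  // The lanes the shift fills with zeroes must be zeroable.
  for (int j = 0; j < Shift; ++j)
    if (!Zeroable[j + (Left ? 0 : Size - Shift)])
      return false;
  // The surviving lanes must be a sequential run; undef lanes are free.
  int Pos = Left ? Shift : 0, Low = Left ? 0 : Shift;
  for (int i = 0; i != Size - Shift; ++i)
    if (Mask[Pos + i] >= 0 && Mask[Pos + i] != Low + i)
      return false;
  return true;
}

// matchesKShift({-1, 0, 1, 2}, {true, false, false, false}, 1, true) holds:
// a v4i1 KSHIFTL by 1 zeroes lane 0 and moves elements 0..2 into lanes 1..3.
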
| 19796 | ||||
| 19797 | ||||
| 19798 | // Lower vXi1 vector shuffles. | |||
| 19799 | // There is no dedicated instruction on AVX-512 that shuffles the masks. | |||
| 19800 | // The only way to shuffle bits is to sign-extend the mask vector to a SIMD | |||
| 19801 | // vector, shuffle it, and then truncate it back. | |||
| 19802 | static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask, | |||
| 19803 | MVT VT, SDValue V1, SDValue V2, | |||
| 19804 | const APInt &Zeroable, | |||
| 19805 | const X86Subtarget &Subtarget, | |||
| 19806 | SelectionDAG &DAG) { | |||
| 19807 | assert(Subtarget.hasAVX512() && | |||
| 19808 | "Cannot lower 512-bit vectors w/o basic ISA!"); | |||
| 19809 | ||||
| 19810 | int NumElts = Mask.size(); | |||
| 19811 | ||||
| 19812 | // Try to recognize shuffles that are just padding a subvector with zeros. | |||
| 19813 | int SubvecElts = 0; | |||
| 19814 | int Src = -1; | |||
| 19815 | for (int i = 0; i != NumElts; ++i) { | |||
| 19816 | if (Mask[i] >= 0) { | |||
| 19817 | // Grab the source from the first valid mask. All subsequent elements need | |||
| 19818 | // to use this same source. | |||
| 19819 | if (Src < 0) | |||
| 19820 | Src = Mask[i] / NumElts; | |||
| 19821 | if (Src != (Mask[i] / NumElts) || (Mask[i] % NumElts) != i) | |||
| 19822 | break; | |||
| 19823 | } | |||
| 19824 | ||||
| 19825 | ++SubvecElts; | |||
| 19826 | } | |||
| 19827 | assert(SubvecElts != NumElts && "Identity shuffle?"); | |||
| 19828 | ||||
| 19829 | // Clip to a power of 2. | |||
| 19830 | SubvecElts = llvm::bit_floor<uint32_t>(SubvecElts); | |||
| 19831 | ||||
| 19832 | // Make sure the number of zeroable bits in the top at least covers the bits | |||
| 19833 | // not covered by the subvector. | |||
| 19834 | if ((int)Zeroable.countl_one() >= (NumElts - SubvecElts)) { | |||
| 19835 | assert(Src >= 0 && "Expected a source!"); | |||
| 19836 | MVT ExtractVT = MVT::getVectorVT(MVT::i1, SubvecElts); | |||
| 19837 | SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtractVT, | |||
| 19838 | Src == 0 ? V1 : V2, | |||
| 19839 | DAG.getIntPtrConstant(0, DL)); | |||
| 19840 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, | |||
| 19841 | DAG.getConstant(0, DL, VT), | |||
| 19842 | Extract, DAG.getIntPtrConstant(0, DL)); | |||
| 19843 | } | |||
| 19844 | ||||
| 19845 | // Try a simple shift right with undef elements. Later we'll try with zeros. | |||
| 19846 | if (SDValue Shift = lower1BitShuffleAsKSHIFTR(DL, Mask, VT, V1, V2, Subtarget, | |||
| 19847 | DAG)) | |||
| 19848 | return Shift; | |||
| 19849 | ||||
| 19850 | // Try to match KSHIFTs. | |||
| 19851 | unsigned Offset = 0; | |||
| 19852 | for (SDValue V : { V1, V2 }) { | |||
| 19853 | unsigned Opcode; | |||
| 19854 | int ShiftAmt = match1BitShuffleAsKSHIFT(Opcode, Mask, Offset, Zeroable); | |||
| 19855 | if (ShiftAmt >= 0) { | |||
| 19856 | MVT WideVT = VT; | |||
| 19857 | if ((!Subtarget.hasDQI() && NumElts == 8) || NumElts < 8) | |||
| 19858 | WideVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; | |||
| 19859 | SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, | |||
| 19860 | DAG.getUNDEF(WideVT), V, | |||
| 19861 | DAG.getIntPtrConstant(0, DL)); | |||
| 19862 | // Widened right shifts need two shifts to ensure we shift in zeroes. | |||
| 19863 | if (Opcode == X86ISD::KSHIFTR && WideVT != VT) { | |||
| 19864 | int WideElts = WideVT.getVectorNumElements(); | |||
| 19865 | // Shift left to put the original vector in the MSBs of the new size. | |||
| 19866 | Res = DAG.getNode(X86ISD::KSHIFTL, DL, WideVT, Res, | |||
| 19867 | DAG.getTargetConstant(WideElts - NumElts, DL, MVT::i8)); | |||
| 19868 | // Increase the shift amount to account for the left shift. | |||
| 19869 | ShiftAmt += WideElts - NumElts; | |||
| 19870 | } | |||
| 19871 | ||||
| 19872 | Res = DAG.getNode(Opcode, DL, WideVT, Res, | |||
| 19873 | DAG.getTargetConstant(ShiftAmt, DL, MVT::i8)); | |||
| 19874 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, | |||
| 19875 | DAG.getIntPtrConstant(0, DL)); | |||
| 19876 | } | |||
| 19877 | Offset += NumElts; // Increment for next iteration. | |||
| 19878 | } | |||
| 19879 | ||||
| 19880 | // If we're broadcasting a SETCC result, try to broadcast the ops instead. | |||
| 19881 | // TODO: What other unary shuffles would benefit from this? | |||
| 19882 | if (isBroadcastShuffleMask(Mask) && V1.getOpcode() == ISD::SETCC && | |||
| 19883 | V1->hasOneUse()) { | |||
| 19884 | SDValue Op0 = V1.getOperand(0); | |||
| 19885 | SDValue Op1 = V1.getOperand(1); | |||
| 19886 | ISD::CondCode CC = cast<CondCodeSDNode>(V1.getOperand(2))->get(); | |||
| 19887 | EVT OpVT = Op0.getValueType(); | |||
| 19888 | return DAG.getSetCC( | |||
| 19889 | DL, VT, DAG.getVectorShuffle(OpVT, DL, Op0, DAG.getUNDEF(OpVT), Mask), | |||
| 19890 | DAG.getVectorShuffle(OpVT, DL, Op1, DAG.getUNDEF(OpVT), Mask), CC); | |||
| 19891 | } | |||
| 19892 | ||||
| 19893 | MVT ExtVT; | |||
| 19894 | switch (VT.SimpleTy) { | |||
| 19895 | default: | |||
| 19896 | llvm_unreachable("Expected a vector of i1 elements"); | |||
| 19897 | case MVT::v2i1: | |||
| 19898 | ExtVT = MVT::v2i64; | |||
| 19899 | break; | |||
| 19900 | case MVT::v4i1: | |||
| 19901 | ExtVT = MVT::v4i32; | |||
| 19902 | break; | |||
| 19903 | case MVT::v8i1: | |||
| 19904 | // Take a 512-bit type: more shuffles are available on KNL. If we have VLX, | |||
| 19905 | // use a 256-bit shuffle. | |||
| 19906 | ExtVT = Subtarget.hasVLX() ? MVT::v8i32 : MVT::v8i64; | |||
| 19907 | break; | |||
| 19908 | case MVT::v16i1: | |||
| 19909 | // Take 512-bit type, unless we are avoiding 512-bit types and have the | |||
| 19910 | // 256-bit operation available. | |||
| 19911 | ExtVT = Subtarget.canExtendTo512DQ() ? MVT::v16i32 : MVT::v16i16; | |||
| 19912 | break; | |||
| 19913 | case MVT::v32i1: | |||
| 19914 | // Take 512-bit type, unless we are avoiding 512-bit types and have the | |||
| 19915 | // 256-bit operation available. | |||
| 19916 | assert(Subtarget.hasBWI() && "Expected AVX512BW support"); | |||
| 19917 | ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8; | |||
| 19918 | break; | |||
| 19919 | case MVT::v64i1: | |||
| 19920 | // Fall back to scalarization. FIXME: We can do better if the shuffle | |||
| 19921 | // can be partitioned cleanly. | |||
| 19922 | if (!Subtarget.useBWIRegs()) | |||
| 19923 | return SDValue(); | |||
| 19924 | ExtVT = MVT::v64i8; | |||
| 19925 | break; | |||
| 19926 | } | |||
| 19927 | ||||
| 19928 | V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1); | |||
| 19929 | V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2); | |||
| 19930 | ||||
| 19931 | SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask); | |||
| 19932 | // i1 was sign-extended, so we can use X86ISD::CVT2MASK. | |||
| 19933 | int NumElems = VT.getVectorNumElements(); | |||
| 19934 | if ((Subtarget.hasBWI() && (NumElems >= 32)) || | |||
| 19935 | (Subtarget.hasDQI() && (NumElems < 32))) | |||
| 19936 | return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, ExtVT), | |||
| 19937 | Shuffle, ISD::SETGT); | |||
| 19938 | ||||
| 19939 | return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle); | |||
| 19940 | } | |||
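
The extend/shuffle/truncate fallback at the end of this routine can be modeled on a scalar for a v8i1 mask. This sketch (illustrative only, single-input masks, not LLVM code) treats the eight mask bits as byte lanes:

#include <cstdint>
#include <vector>

// Widen each mask bit to an i8 lane (0x00 or 0xFF), shuffle the byte
// lanes, then compress the lane values back into a mask. Models the
// single-input case only.
static uint8_t shuffleMaskBits(uint8_t Bits, const std::vector<int> &Mask) {
  uint8_t Lanes[8], Out = 0;
  for (int i = 0; i != 8; ++i) // "sign extend" each bit to a byte lane
    Lanes[i] = ((Bits >> i) & 1) ? 0xFF : 0x00;
  for (int i = 0; i != 8; ++i) { // shuffle, then truncate back to one bit
    uint8_t L = Mask[i] < 0 ? 0 : Lanes[Mask[i]];
    Out |= (uint8_t)((L & 1) << i);
  }
  return Out;
}

// shuffleMaskBits(0b00000101, {1, 0, 3, 2, 5, 4, 7, 6}) == 0b00001010:
// swapping adjacent lanes swaps adjacent mask bits.
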
| 19941 | ||||
| 19942 | /// Helper function that returns true if the shuffle mask should be | |||
| 19943 | /// commuted to improve canonicalization. | |||
| 19944 | static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) { | |||
| 19945 | int NumElements = Mask.size(); | |||
| 19946 | ||||
| 19947 | int NumV1Elements = 0, NumV2Elements = 0; | |||
| 19948 | for (int M : Mask) | |||
| 19949 | if (M < 0) | |||
| 19950 | continue; | |||
| 19951 | else if (M < NumElements) | |||
| 19952 | ++NumV1Elements; | |||
| 19953 | else | |||
| 19954 | ++NumV2Elements; | |||
| 19955 | ||||
| 19956 | // Commute the shuffle as needed such that more elements come from V1 than | |||
| 19957 | // V2. This allows us to match the shuffle pattern strictly on how many | |||
| 19958 | // elements come from V1 without handling the symmetric cases. | |||
| 19959 | if (NumV2Elements > NumV1Elements) | |||
| 19960 | return true; | |||
| 19961 | ||||
| 19962 | assert(NumV1Elements > 0 && "No V1 indices"); | |||
| 19963 | ||||
| 19964 | if (NumV2Elements == 0) | |||
| 19965 | return false; | |||
| 19966 | ||||
| 19967 | // When the number of V1 and V2 elements are the same, try to minimize the | |||
| 19968 | // number of uses of V2 in the low half of the vector. When that is tied, | |||
| 19969 | // ensure that the sum of indices for V1 is equal to or lower than the sum of | |||
| 19970 | // indices for V2. When those are equal, try to ensure that the number of odd | |||
| 19971 | // indices for V1 is lower than the number of odd indices for V2. | |||
| 19972 | if (NumV1Elements == NumV2Elements) { | |||
| 19973 | int LowV1Elements = 0, LowV2Elements = 0; | |||
| 19974 | for (int M : Mask.slice(0, NumElements / 2)) | |||
| 19975 | if (M >= NumElements) | |||
| 19976 | ++LowV2Elements; | |||
| 19977 | else if (M >= 0) | |||
| 19978 | ++LowV1Elements; | |||
| 19979 | if (LowV2Elements > LowV1Elements) | |||
| 19980 | return true; | |||
| 19981 | if (LowV2Elements == LowV1Elements) { | |||
| 19982 | int SumV1Indices = 0, SumV2Indices = 0; | |||
| 19983 | for (int i = 0, Size = Mask.size(); i < Size; ++i) | |||
| 19984 | if (Mask[i] >= NumElements) | |||
| 19985 | SumV2Indices += i; | |||
| 19986 | else if (Mask[i] >= 0) | |||
| 19987 | SumV1Indices += i; | |||
| 19988 | if (SumV2Indices < SumV1Indices) | |||
| 19989 | return true; | |||
| 19990 | if (SumV2Indices == SumV1Indices) { | |||
| 19991 | int NumV1OddIndices = 0, NumV2OddIndices = 0; | |||
| 19992 | for (int i = 0, Size = Mask.size(); i < Size; ++i) | |||
| 19993 | if (Mask[i] >= NumElements) | |||
| 19994 | NumV2OddIndices += i % 2; | |||
| 19995 | else if (Mask[i] >= 0) | |||
| 19996 | NumV1OddIndices += i % 2; | |||
| 19997 | if (NumV2OddIndices < NumV1OddIndices) | |||
| 19998 | return true; | |||
| 19999 | } | |||
| 20000 | } | |||
| 20001 | } | |||
| 20002 | ||||
| 20003 | return false; | |||
| 20004 | } | |||
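
A compact standalone model of this commute heuristic (hypothetical name, not LLVM code, and with the final odd-index tie-break omitted for brevity):

#include <vector>

// Returns true when swapping V1/V2 (and rewriting the mask) is preferred.
static bool shouldCommute(const std::vector<int> &Mask) {
  int N = (int)Mask.size();
  int NumV1 = 0, NumV2 = 0;
  for (int M : Mask)
    if (M >= N)
      ++NumV2;
    else if (M >= 0)
      ++NumV1;
  if (NumV2 != NumV1)
    return NumV2 > NumV1; // prefer more elements from V1
  int LowV1 = 0, LowV2 = 0;
  for (int i = 0; i != N / 2; ++i)
    if (Mask[i] >= N)
      ++LowV2;
    else if (Mask[i] >= 0)
      ++LowV1;
  if (LowV2 != LowV1)
    return LowV2 > LowV1; // prefer fewer V2 uses in the low half
  int SumV1 = 0, SumV2 = 0;
  for (int i = 0; i != N; ++i)
    if (Mask[i] >= N)
      SumV2 += i;
    else if (Mask[i] >= 0)
      SumV1 += i;
  return SumV2 < SumV1; // prefer the lower index sum on V1
}
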
| 20005 | ||||
| 20006 | static bool canCombineAsMaskOperation(SDValue V1, SDValue V2, | |||
| 20007 | const X86Subtarget &Subtarget) { | |||
| 20008 | if (!Subtarget.hasAVX512()) | |||
| 20009 | return false; | |||
| 20010 | ||||
| 20011 | MVT VT = V1.getSimpleValueType().getScalarType(); | |||
| 20012 | if ((VT == MVT::i16 || VT == MVT::i8) && !Subtarget.hasBWI()) | |||
| 20013 | return false; | |||
| 20014 | ||||
| 20015 | // If vec width < 512, widen i8/i16 even with BWI as blendd/blendps/blendpd | |||
| 20016 | // are preferable to blendw/blendvb/masked-mov. | |||
| 20017 | if ((VT == MVT::i16 || VT == MVT::i8) && | |||
| 20018 | V1.getSimpleValueType().getSizeInBits() < 512) | |||
| 20019 | return false; | |||
| 20020 | ||||
| 20021 | auto HasMaskOperation = [&](SDValue V) { | |||
| 20022 | // TODO: Currently we only check a limited set of opcodes. We could | |||
| 20023 | // probably extend it to all binary operations by checking TLI.isBinOp(). | |||
| 20024 | switch (V->getOpcode()) { | |||
| 20025 | default: | |||
| 20026 | return false; | |||
| 20027 | case ISD::ADD: | |||
| 20028 | case ISD::SUB: | |||
| 20029 | case ISD::AND: | |||
| 20030 | case ISD::XOR: | |||
| 20031 | case ISD::OR: | |||
| 20032 | case ISD::SMAX: | |||
| 20033 | case ISD::SMIN: | |||
| 20034 | case ISD::UMAX: | |||
| 20035 | case ISD::UMIN: | |||
| 20036 | case ISD::ABS: | |||
| 20037 | case ISD::SHL: | |||
| 20038 | case ISD::SRL: | |||
| 20039 | case ISD::SRA: | |||
| 20040 | case ISD::MUL: | |||
| 20041 | break; | |||
| 20042 | } | |||
| 20043 | if (!V->hasOneUse()) | |||
| 20044 | return false; | |||
| 20045 | ||||
| 20046 | return true; | |||
| 20047 | }; | |||
| 20048 | ||||
| 20049 | if (HasMaskOperation(V1) || HasMaskOperation(V2)) | |||
| 20050 | return true; | |||
| 20051 | ||||
| 20052 | return false; | |||
| 20053 | } | |||
| 20054 | ||||
| 20055 | // Forward declaration. | |||
| 20056 | static SDValue canonicalizeShuffleMaskWithHorizOp( | |||
| 20057 | MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask, | |||
| 20058 | unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, | |||
| 20059 | const X86Subtarget &Subtarget); | |||
| 20060 | ||||
| 20061 | /// Top-level lowering for x86 vector shuffles. | |||
| 20062 | /// | |||
| 20063 | /// This handles decomposition, canonicalization, and lowering of all x86 | |||
| 20064 | /// vector shuffles. Most of the specific lowering strategies are encapsulated | |||
| 20065 | /// above in helper routines. The canonicalization attempts to widen shuffles | |||
| 20066 | /// to involve fewer lanes of wider elements, consolidate symmetric patterns | |||
| 20067 | /// s.t. only one of the two inputs needs to be tested, etc. | |||
| 20068 | static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget, | |||
| 20069 | SelectionDAG &DAG) { | |||
| 20070 | ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); | |||
| 20071 | ArrayRef<int> OrigMask = SVOp->getMask(); | |||
| 20072 | SDValue V1 = Op.getOperand(0); | |||
| 20073 | SDValue V2 = Op.getOperand(1); | |||
| 20074 | MVT VT = Op.getSimpleValueType(); | |||
| 20075 | int NumElements = VT.getVectorNumElements(); | |||
| 20076 | SDLoc DL(Op); | |||
| 20077 | bool Is1BitVector = (VT.getVectorElementType() == MVT::i1); | |||
| 20078 | ||||
| 20079 | assert((VT.getSizeInBits() != 64 || Is1BitVector) && | |||
| 20080 | "Can't lower MMX shuffles"); | |||
| 20081 | ||||
| 20082 | bool V1IsUndef = V1.isUndef(); | |||
| 20083 | bool V2IsUndef = V2.isUndef(); | |||
| 20084 | if (V1IsUndef && V2IsUndef) | |||
| 20085 | return DAG.getUNDEF(VT); | |||
| 20086 | ||||
| 20087 | // When we create a shuffle node we put the UNDEF node in the second operand, | |||
| 20088 | // but in some cases the first operand may be transformed to UNDEF. | |||
| 20089 | // In this case we should just commute the node. | |||
| 20090 | if (V1IsUndef) | |||
| 20091 | return DAG.getCommutedVectorShuffle(*SVOp); | |||
| 20092 | ||||
| 20093 | // Check for non-undef masks pointing at an undef vector and make the masks | |||
| 20094 | // undef as well. This makes it easier to match the shuffle based solely on | |||
| 20095 | // the mask. | |||
| 20096 | if (V2IsUndef && | |||
| 20097 | any_of(OrigMask, [NumElements](int M) { return M >= NumElements; })) { | |||
| 20098 | SmallVector<int, 8> NewMask(OrigMask); | |||
| 20099 | for (int &M : NewMask) | |||
| 20100 | if (M >= NumElements) | |||
| 20101 | M = -1; | |||
| 20102 | return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask); | |||
| 20103 | } | |||
| 20104 | ||||
| 20105 | // Check for illegal shuffle mask element index values. | |||
| 20106 | int MaskUpperLimit = OrigMask.size() * (V2IsUndef ? 1 : 2); | |||
| 20107 | (void)MaskUpperLimit; | |||
| 20108 | assert(llvm::all_of(OrigMask, | |||
| 20109 | [&](int M) { return -1 <= M && M < MaskUpperLimit; }) && | |||
| 20110 | "Out of bounds shuffle index"); | |||
| 20111 | ||||
| 20112 | // We actually see shuffles that are entirely re-arrangements of a set of | |||
| 20113 | // zero inputs. This mostly happens while decomposing complex shuffles into | |||
| 20114 | // simple ones. Directly lower these as a buildvector of zeros. | |||
| 20115 | APInt KnownUndef, KnownZero; | |||
| 20116 | computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero); | |||
| 20117 | ||||
| 20118 | APInt Zeroable = KnownUndef | KnownZero; | |||
| 20119 | if (Zeroable.isAllOnes()) | |||
| 20120 | return getZeroVector(VT, Subtarget, DAG, DL); | |||
| 20121 | ||||
| 20122 | bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode()); | |||
| 20123 | ||||
| 20124 | // Try to collapse shuffles into using a vector type with fewer elements but | |||
| 20125 | // wider element types. We cap this to not form integers or floating point | |||
| 20126 | // elements wider than 64 bits. It does not seem beneficial to form i128 | |||
| 20127 | // integers to handle flipping the low and high halves of AVX 256-bit vectors. | |||
| 20128 | SmallVector<int, 16> WidenedMask; | |||
| 20129 | if (VT.getScalarSizeInBits() < 64 && !Is1BitVector && | |||
| 20130 | !canCombineAsMaskOperation(V1, V2, Subtarget) && | |||
| 20131 | canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) { | |||
| 20132 | // Shuffle mask widening should not interfere with a broadcast opportunity | |||
| 20133 | // by obfuscating the operands with bitcasts. | |||
| 20134 | // TODO: Avoid lowering directly from this top-level function: make this | |||
| 20135 | // a query (canLowerAsBroadcast) and defer lowering to the type-based calls. | |||
| 20136 | if (SDValue Broadcast = lowerShuffleAsBroadcast(DL, VT, V1, V2, OrigMask, | |||
| 20137 | Subtarget, DAG)) | |||
| 20138 | return Broadcast; | |||
| 20139 | ||||
| 20140 | MVT NewEltVT = VT.isFloatingPoint() | |||
| 20141 | ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2) | |||
| 20142 | : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2); | |||
| 20143 | int NewNumElts = NumElements / 2; | |||
| 20144 | MVT NewVT = MVT::getVectorVT(NewEltVT, NewNumElts); | |||
| 20145 | // Make sure that the new vector type is legal. For example, v2f64 isn't | |||
| 20146 | // legal on SSE1. | |||
| 20147 | if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) { | |||
| 20148 | if (V2IsZero) { | |||
| 20149 | // Modify the new Mask to take all zeros from the all-zero vector. | |||
| 20150 | // Choose indices that are blend-friendly. | |||
| 20151 | bool UsedZeroVector = false; | |||
| 20152 | assert(is_contained(WidenedMask, SM_SentinelZero) && | |||
| 20153 | "V2's non-undef elements are used?!"); | |||
| 20154 | for (int i = 0; i != NewNumElts; ++i) | |||
| 20155 | if (WidenedMask[i] == SM_SentinelZero) { | |||
| 20156 | WidenedMask[i] = i + NewNumElts; | |||
| 20157 | UsedZeroVector = true; | |||
| 20158 | } | |||
| 20159 | // Ensure all elements of V2 are zero - isBuildVectorAllZeros permits | |||
| 20160 | // some elements to be undef. | |||
| 20161 | if (UsedZeroVector) | |||
| 20162 | V2 = getZeroVector(NewVT, Subtarget, DAG, DL); | |||
| 20163 | } | |||
| 20164 | V1 = DAG.getBitcast(NewVT, V1); | |||
| 20165 | V2 = DAG.getBitcast(NewVT, V2); | |||
| 20166 | return DAG.getBitcast( | |||
| 20167 | VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask)); | |||
| 20168 | } | |||
| 20169 | } | |||
| 20170 | ||||
| 20171 | SmallVector<SDValue> Ops = {V1, V2}; | |||
| 20172 | SmallVector<int> Mask(OrigMask); | |||
| 20173 | ||||
| 20174 | // Canonicalize the shuffle with any horizontal ops inputs. | |||
| 20175 | // NOTE: This may update Ops and Mask. | |||
| 20176 | if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp( | |||
| 20177 | Ops, Mask, VT.getSizeInBits(), DL, DAG, Subtarget)) | |||
| 20178 | return DAG.getBitcast(VT, HOp); | |||
| 20179 | ||||
| 20180 | V1 = DAG.getBitcast(VT, Ops[0]); | |||
| 20181 | V2 = DAG.getBitcast(VT, Ops[1]); | |||
| 20182 | assert(NumElements == (int)Mask.size() && | |||
| 20183 | "canonicalizeShuffleMaskWithHorizOp " | |||
| 20184 | "shouldn't alter the shuffle mask size"); | |||
| 20185 | ||||
| 20186 | // Commute the shuffle if it will improve canonicalization. | |||
| 20187 | if (canonicalizeShuffleMaskWithCommute(Mask)) { | |||
| 20188 | ShuffleVectorSDNode::commuteMask(Mask); | |||
| 20189 | std::swap(V1, V2); | |||
| 20190 | } | |||
| 20191 | ||||
| 20192 | // For each vector width, delegate to a specialized lowering routine. | |||
| 20193 | if (VT.is128BitVector()) | |||
| 20194 | return lower128BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); | |||
| 20195 | ||||
| 20196 | if (VT.is256BitVector()) | |||
| 20197 | return lower256BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); | |||
| 20198 | ||||
| 20199 | if (VT.is512BitVector()) | |||
| 20200 | return lower512BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); | |||
| 20201 | ||||
| 20202 | if (Is1BitVector) | |||
| 20203 | return lower1BitShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget, DAG); | |||
| 20204 | ||||
| 20205 | llvm_unreachable("Unimplemented!"); | |||
| 20206 | } | |||
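
The widening canonicalization near the top of this routine can be pictured on the mask alone: a shuffle of 2N narrow elements collapses to N elements of twice the width when every even/odd pair of mask entries addresses one aligned wide element. A simplified standalone model (not the real canWidenShuffleElements: this one gives up on half-undef pairs and ignores zeroable lanes):

#include <vector>

// Attempts to rewrite Mask over elements of twice the width.
static bool widenShuffleMask(const std::vector<int> &Mask,
                             std::vector<int> &Widened) {
  Widened.clear();
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo < 0 && Hi < 0)
      Widened.push_back(-1); // both undef -> wide undef
    else if (Lo >= 0 && Hi == Lo + 1 && (Lo % 2) == 0)
      Widened.push_back(Lo / 2); // aligned pair -> one wide element
    else
      return false; // pair straddles wide elements; keep the narrow mask
  }
  return true;
}

// {0, 1, 6, 7} widens to {0, 3}; {1, 2, 3, 4} does not widen.
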
| 20207 | ||||
| 20208 | /// Try to lower a VSELECT instruction to a vector shuffle. | |||
| 20209 | static SDValue lowerVSELECTtoVectorShuffle(SDValue Op, | |||
| 20210 | const X86Subtarget &Subtarget, | |||
| 20211 | SelectionDAG &DAG) { | |||
| 20212 | SDValue Cond = Op.getOperand(0); | |||
| 20213 | SDValue LHS = Op.getOperand(1); | |||
| 20214 | SDValue RHS = Op.getOperand(2); | |||
| 20215 | MVT VT = Op.getSimpleValueType(); | |||
| 20216 | ||||
| 20217 | // Only non-legal VSELECTs reach this lowering; convert those into generic | |||
| 20218 | // shuffles and re-use the shuffle lowering path for blends. | |||
| 20219 | if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) { | |||
| 20220 | SmallVector<int, 32> Mask; | |||
| 20221 | if (createShuffleMaskFromVSELECT(Mask, Cond)) | |||
| 20222 | return DAG.getVectorShuffle(VT, SDLoc(Op), LHS, RHS, Mask); | |||
| 20223 | } | |||
| 20224 | ||||
| 20225 | return SDValue(); | |||
| 20226 | } | |||
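
The conversion itself is mechanical once the condition lanes are known constants: lane i of the blend takes LHS element i when the condition lane is all-ones, otherwise RHS element i, which is shuffle index i + N. A minimal sketch under that assumption (hypothetical name, bool lanes standing in for all-ones/all-zeros constants):

#include <vector>

// Builds a two-input shuffle mask equivalent to (vselect Cond, LHS, RHS).
static std::vector<int> blendMaskFromCondition(const std::vector<bool> &Cond) {
  int N = (int)Cond.size();
  std::vector<int> Mask(N);
  for (int i = 0; i != N; ++i)
    Mask[i] = Cond[i] ? i : i + N; // i selects LHS, i + N selects RHS
  return Mask;
}

// Cond = {1, 0, 0, 1} on a v4i32 yields Mask = {0, 5, 6, 3}.
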
| 20227 | ||||
| 20228 | SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { | |||
| 20229 | SDValue Cond = Op.getOperand(0); | |||
| 20230 | SDValue LHS = Op.getOperand(1); | |||
| 20231 | SDValue RHS = Op.getOperand(2); | |||
| 20232 | ||||
| 20233 | SDLoc dl(Op); | |||
| 20234 | MVT VT = Op.getSimpleValueType(); | |||
| 20235 | if (isSoftFP16(VT)) { | |||
| 20236 | MVT NVT = VT.changeVectorElementTypeToInteger(); | |||
| 20237 | return DAG.getBitcast(VT, DAG.getNode(ISD::VSELECT, dl, NVT, Cond, | |||
| 20238 | DAG.getBitcast(NVT, LHS), | |||
| 20239 | DAG.getBitcast(NVT, RHS))); | |||
| 20240 | } | |||
| 20241 | ||||
| 20242 | // A vselect where all conditions and data are constants can be optimized into | |||
| 20243 | // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR(). | |||
| 20244 | if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) && | |||
| 20245 | ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()) && | |||
| 20246 | ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) | |||
| 20247 | return SDValue(); | |||
| 20248 | ||||
| 20249 | // Try to lower this to a blend-style vector shuffle. This can handle all | |||
| 20250 | // constant condition cases. | |||
| 20251 | if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) | |||
| 20252 | return BlendOp; | |||
| 20253 | ||||
| 20254 | // If this VSELECT has a vector of i1 as a mask, it will be directly matched | |||
| 20255 | // with patterns on the mask registers on AVX-512. | |||
| 20256 | MVT CondVT = Cond.getSimpleValueType(); | |||
| 20257 | unsigned CondEltSize = Cond.getScalarValueSizeInBits(); | |||
| 20258 | if (CondEltSize == 1) | |||
| 20259 | return Op; | |||
| 20260 | ||||
| 20261 | // Variable blends are only legal from SSE4.1 onward. | |||
| 20262 | if (!Subtarget.hasSSE41()) | |||
| 20263 | return SDValue(); | |||
| 20264 | ||||
| 20265 | unsigned EltSize = VT.getScalarSizeInBits(); | |||
| 20266 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 20267 | ||||
| 20268 | // Expand v32i16/v64i8 without BWI. | |||
| 20269 | if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) | |||
| 20270 | return SDValue(); | |||
| 20271 | ||||
| 20272 | // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition | |||
| 20273 | // into an i1 condition so that we can use the mask-based 512-bit blend | |||
| 20274 | // instructions. | |||
| 20275 | if (VT.getSizeInBits() == 512) { | |||
| 20276 | // Build a mask by testing the condition against zero. | |||
| 20277 | MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); | |||
| 20278 | SDValue Mask = DAG.getSetCC(dl, MaskVT, Cond, | |||
| 20279 | DAG.getConstant(0, dl, CondVT), | |||
| 20280 | ISD::SETNE); | |||
| 20281 | // Now return a new VSELECT using the mask. | |||
| 20282 | return DAG.getSelect(dl, VT, Mask, LHS, RHS); | |||
| 20283 | } | |||
| 20284 | ||||
| 20285 | // SEXT/TRUNC cases where the mask doesn't match the destination size. | |||
| 20286 | if (CondEltSize != EltSize) { | |||
| 20287 | // If we don't have a sign splat, rely on the expansion. | |||
| 20288 | if (CondEltSize != DAG.ComputeNumSignBits(Cond)) | |||
| 20289 | return SDValue(); | |||
| 20290 | ||||
| 20291 | MVT NewCondSVT = MVT::getIntegerVT(EltSize); | |||
| 20292 | MVT NewCondVT = MVT::getVectorVT(NewCondSVT, NumElts); | |||
| 20293 | Cond = DAG.getSExtOrTrunc(Cond, dl, NewCondVT); | |||
| 20294 | return DAG.getNode(ISD::VSELECT, dl, VT, Cond, LHS, RHS); | |||
| 20295 | } | |||
| 20296 | ||||
| 20297 | // Only some types will be legal on some subtargets. If we can emit a legal | |||
| 20298 | // VSELECT-matching blend, return Op; but if we need to expand, return | |||
| 20299 | // a null value. | |||
| 20300 | switch (VT.SimpleTy) { | |||
| 20301 | default: | |||
| 20302 | // Most of the vector types have blends past SSE4.1. | |||
| 20303 | return Op; | |||
| 20304 | ||||
| 20305 | case MVT::v32i8: | |||
| 20306 | // The byte blends for AVX vectors were introduced only in AVX2. | |||
| 20307 | if (Subtarget.hasAVX2()) | |||
| 20308 | return Op; | |||
| 20309 | ||||
| 20310 | return SDValue(); | |||
| 20311 | ||||
| 20312 | case MVT::v8i16: | |||
| 20313 | case MVT::v16i16: { | |||
| 20314 | // Bitcast everything to the vXi8 type and use a vXi8 vselect. | |||
| 20315 | MVT CastVT = MVT::getVectorVT(MVT::i8, NumElts * 2); | |||
| 20316 | Cond = DAG.getBitcast(CastVT, Cond); | |||
| 20317 | LHS = DAG.getBitcast(CastVT, LHS); | |||
| 20318 | RHS = DAG.getBitcast(CastVT, RHS); | |||
| 20319 | SDValue Select = DAG.getNode(ISD::VSELECT, dl, CastVT, Cond, LHS, RHS); | |||
| 20320 | return DAG.getBitcast(VT, Select); | |||
| 20321 | } | |||
| 20322 | } | |||
| 20323 | } | |||
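
The v8i16/v16i16 case above relies on the condition being a sign splat: each all-ones or all-zeros 16-bit condition lane bitcasts to two identical 8-bit lanes, so a byte-granular blend reproduces the word-granular one. A scalar model of one lane (illustrative only, not LLVM code):

#include <cstdint>

// Blends one 16-bit lane through two 8-bit sub-lane selects.
static uint16_t blendWordViaBytes(uint16_t CondLane, uint16_t L, uint16_t R) {
  uint8_t CondLo = (uint8_t)CondLane, CondHi = (uint8_t)(CondLane >> 8);
  uint8_t Lo = CondLo ? (uint8_t)L : (uint8_t)R;
  uint8_t Hi = CondHi ? (uint8_t)(L >> 8) : (uint8_t)(R >> 8);
  return (uint16_t)(Lo | ((uint16_t)Hi << 8));
}

// With CondLane == 0xFFFF or 0x0000 this equals CondLane ? L : R.
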
| 20324 | ||||
| 20325 | static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { | |||
| 20326 | MVT VT = Op.getSimpleValueType(); | |||
| 20327 | SDValue Vec = Op.getOperand(0); | |||
| 20328 | SDValue Idx = Op.getOperand(1); | |||
| 20329 | assert(isa<ConstantSDNode>(Idx) && "Constant index expected"); | |||
| 20330 | SDLoc dl(Op); | |||
| 20331 | ||||
| 20332 | if (!Vec.getSimpleValueType().is128BitVector()) | |||
| 20333 | return SDValue(); | |||
| 20334 | ||||
| 20335 | if (VT.getSizeInBits() == 8) { | |||
| 20336 | // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless | |||
| 20337 | // we're going to zero extend the register or fold the store. | |||
| 20338 | if (llvm::isNullConstant(Idx) && !X86::mayFoldIntoZeroExtend(Op) && | |||
| 20339 | !X86::mayFoldIntoStore(Op)) | |||
| 20340 | return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, | |||
| 20341 | DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, | |||
| 20342 | DAG.getBitcast(MVT::v4i32, Vec), Idx)); | |||
| 20343 | ||||
| 20344 | unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); | |||
| 20345 | SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec, | |||
| 20346 | DAG.getTargetConstant(IdxVal, dl, MVT::i8)); | |||
| 20347 | return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); | |||
| 20348 | } | |||
| 20349 | ||||
| 20350 | if (VT == MVT::f32) { | |||
| 20351 | // EXTRACTPS outputs to a GPR32 register which will require a movd to copy | |||
| 20352 | // the result back to an FR32 register. It's only worth matching if the | |||
| 20353 | // result has a single use which is a store or a bitcast to i32. And in | |||
| 20354 | // the case of a store, it's not worth it if the index is a constant 0, | |||
| 20355 | // because a MOVSSmr can be used instead, which is smaller and faster. | |||
| 20356 | if (!Op.hasOneUse()) | |||
| 20357 | return SDValue(); | |||
| 20358 | SDNode *User = *Op.getNode()->use_begin(); | |||
| 20359 | if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) && | |||
| 20360 | (User->getOpcode() != ISD::BITCAST || | |||
| 20361 | User->getValueType(0) != MVT::i32)) | |||
| 20362 | return SDValue(); | |||
| 20363 | SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, | |||
| 20364 | DAG.getBitcast(MVT::v4i32, Vec), Idx); | |||
| 20365 | return DAG.getBitcast(MVT::f32, Extract); | |||
| 20366 | } | |||
| 20367 | ||||
| 20368 | if (VT == MVT::i32 || VT == MVT::i64) | |||
| 20369 | return Op; | |||
| 20370 | ||||
| 20371 | return SDValue(); | |||
| 20372 | } | |||
| 20373 | ||||
| 20374 | /// Extract one bit from mask vector, like v16i1 or v8i1. | |||
| 20375 | /// AVX-512 feature. | |||
| 20376 | static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, | |||
| 20377 | const X86Subtarget &Subtarget) { | |||
| 20378 | SDValue Vec = Op.getOperand(0); | |||
| 20379 | SDLoc dl(Vec); | |||
| 20380 | MVT VecVT = Vec.getSimpleValueType(); | |||
| 20381 | SDValue Idx = Op.getOperand(1); | |||
| 20382 | auto* IdxC = dyn_cast<ConstantSDNode>(Idx); | |||
| 20383 | MVT EltVT = Op.getSimpleValueType(); | |||
| 20384 | ||||
| 20385 | assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) && | |||
| 20386 | "Unexpected vector type in ExtractBitFromMaskVector"); | |||
| 20387 | ||||
| 20388 | // A variable index can't be handled in mask registers, | |||
| 20389 | // so extend the vector to VR512/VR128. | |||
| 20390 | if (!IdxC) { | |||
| 20391 | unsigned NumElts = VecVT.getVectorNumElements(); | |||
| 20392 | // Extending v8i1/v16i1 to 512-bit gets better performance on KNL | |||
| 20393 | // than extending to 128/256-bit. | |||
| 20394 | MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8; | |||
| 20395 | MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); | |||
| 20396 | SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec); | |||
| 20397 | SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtEltVT, Ext, Idx); | |||
| 20398 | return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); | |||
| 20399 | } | |||
| 20400 | ||||
| 20401 | unsigned IdxVal = IdxC->getZExtValue(); | |||
| 20402 | if (IdxVal == 0) // the operation is legal | |||
| 20403 | return Op; | |||
| 20404 | ||||
| 20405 | // Extend to natively supported kshift. | |||
| 20406 | unsigned NumElems = VecVT.getVectorNumElements(); | |||
| 20407 | MVT WideVecVT = VecVT; | |||
| 20408 | if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) { | |||
| 20409 | WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; | |||
| 20410 | Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT, | |||
| 20411 | DAG.getUNDEF(WideVecVT), Vec, | |||
| 20412 | DAG.getIntPtrConstant(0, dl)); | |||
| 20413 | } | |||
| 20414 | ||||
| 20415 | // Use kshiftr instruction to move to the lower element. | |||
| 20416 | Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec, | |||
| 20417 | DAG.getTargetConstant(IdxVal, dl, MVT::i8)); | |||
| 20418 | ||||
| 20419 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, | |||
| 20420 | DAG.getIntPtrConstant(0, dl)); | |||
| 20421 | } | |||
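
The constant-index path above is easiest to see on the underlying k-register: a v16i1 mask is just sixteen bits, so KSHIFTR followed by extracting element 0 is an ordinary bit test. A scalar model (illustrative only):

#include <cstdint>

// Models "kshiftr k, IdxVal; extract element 0" on a 16-lane mask.
static bool extractMaskBit(uint16_t KReg, unsigned IdxVal) {
  return ((KReg >> IdxVal) & 1) != 0;
}
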
| 20422 | ||||
| 20423 | SDValue | |||
| 20424 | X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, | |||
| 20425 | SelectionDAG &DAG) const { | |||
| 20426 | SDLoc dl(Op); | |||
| 20427 | SDValue Vec = Op.getOperand(0); | |||
| 20428 | MVT VecVT = Vec.getSimpleValueType(); | |||
| 20429 | SDValue Idx = Op.getOperand(1); | |||
| 20430 | auto* IdxC = dyn_cast<ConstantSDNode>(Idx); | |||
| 20431 | ||||
| 20432 | if (VecVT.getVectorElementType() == MVT::i1) | |||
| 20433 | return ExtractBitFromMaskVector(Op, DAG, Subtarget); | |||
| 20434 | ||||
| 20435 | if (!IdxC) { | |||
| 20436 | // It's more profitable to go through memory (1 cycle throughput) | |||
| 20437 | // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput). | |||
| 20438 | // The IACA tool was used to get the performance estimation | |||
| 20439 | // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer) | |||
| 20440 | // | |||
| 20441 | // example : extractelement <16 x i8> %a, i32 %i | |||
| 20442 | // | |||
| 20443 | // Block Throughput: 3.00 Cycles | |||
| 20444 | // Throughput Bottleneck: Port5 | |||
| 20445 | // | |||
| 20446 | // | Num Of | Ports pressure in cycles | | | |||
| 20447 | // | Uops | 0 - DV | 5 | 6 | 7 | | | |||
| 20448 | // --------------------------------------------- | |||
| 20449 | // | 1 | | 1.0 | | | CP | vmovd xmm1, edi | |||
| 20450 | // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1 | |||
| 20451 | // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0 | |||
| 20452 | // Total Num Of Uops: 4 | |||
| 20453 | // | |||
| 20454 | // | |||
| 20455 | // Block Throughput: 1.00 Cycles | |||
| 20456 | // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4 | |||
| 20457 | // | |||
| 20458 | // | | Ports pressure in cycles | | | |||
| 20459 | // |Uops| 1 | 2 - D |3 - D | 4 | 5 | | | |||
| 20460 | // --------------------------------------------------------- | |||
| 20461 | // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0 | |||
| 20462 | // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18] | |||
| 20463 | // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1] | |||
| 20464 | // Total Num Of Uops: 4 | |||
| 20465 | ||||
| 20466 | return SDValue(); | |||
| 20467 | } | |||
| 20468 | ||||
| 20469 | unsigned IdxVal = IdxC->getZExtValue(); | |||
| 20470 | ||||
| 20471 | // If this is a 256-bit vector result, first extract the 128-bit vector and | |||
| 20472 | // then extract the element from the 128-bit vector. | |||
| 20473 | if (VecVT.is256BitVector() || VecVT.is512BitVector()) { | |||
| 20474 | // Get the 128-bit vector. | |||
| 20475 | Vec = extract128BitVector(Vec, IdxVal, DAG, dl); | |||
| 20476 | MVT EltVT = VecVT.getVectorElementType(); | |||
| 20477 | ||||
| 20478 | unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits(); | |||
| 20479 | assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2"); | |||
| 20480 | ||||
| 20481 | // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2 | |||
| 20482 | // this can be done with a mask. | |||
| 20483 | IdxVal &= ElemsPerChunk - 1; | |||
| 20484 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec, | |||
| 20485 | DAG.getIntPtrConstant(IdxVal, dl)); | |||
| 20486 | } | |||
| 20487 | ||||
| 20488 | assert(VecVT.is128BitVector() && "Unexpected vector length"); | |||
| 20489 | ||||
| 20490 | MVT VT = Op.getSimpleValueType(); | |||
| 20491 | ||||
| 20492 | if (VT == MVT::i16) { | |||
| 20493 | // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless | |||
| 20494 | // we're going to zero extend the register or fold the store (SSE41 only). | |||
| 20495 | if (IdxVal == 0 && !X86::mayFoldIntoZeroExtend(Op) && | |||
| 20496 | !(Subtarget.hasSSE41() && X86::mayFoldIntoStore(Op))) { | |||
| 20497 | if (Subtarget.hasFP16()) | |||
| 20498 | return Op; | |||
| 20499 | ||||
| 20500 | return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, | |||
| 20501 | DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, | |||
| 20502 | DAG.getBitcast(MVT::v4i32, Vec), Idx)); | |||
| 20503 | } | |||
| 20504 | ||||
| 20505 | SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec, | |||
| 20506 | DAG.getTargetConstant(IdxVal, dl, MVT::i8)); | |||
| 20507 | return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); | |||
| 20508 | } | |||
| 20509 | ||||
| 20510 | if (Subtarget.hasSSE41()) | |||
| 20511 | if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG)) | |||
| 20512 | return Res; | |||
| 20513 | ||||
| 20514 | // TODO: We only extract a single element from v16i8; we can probably afford | |||
| 20515 | // to be more aggressive here before using the default approach of spilling to | |||
| 20516 | // stack. | |||
| 20517 | if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) { | |||
| 20518 | // Extract either the lowest i32 or any i16, and extract the sub-byte. | |||
| 20519 | int DWordIdx = IdxVal / 4; | |||
| 20520 | if (DWordIdx == 0) { | |||
| 20521 | SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, | |||
| 20522 | DAG.getBitcast(MVT::v4i32, Vec), | |||
| 20523 | DAG.getIntPtrConstant(DWordIdx, dl)); | |||
| 20524 | int ShiftVal = (IdxVal % 4) * 8; | |||
| 20525 | if (ShiftVal != 0) | |||
| 20526 | Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res, | |||
| 20527 | DAG.getConstant(ShiftVal, dl, MVT::i8)); | |||
| 20528 | return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); | |||
| 20529 | } | |||
| 20530 | ||||
| 20531 | int WordIdx = IdxVal / 2; | |||
| 20532 | SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, | |||
| 20533 | DAG.getBitcast(MVT::v8i16, Vec), | |||
| 20534 | DAG.getIntPtrConstant(WordIdx, dl)); | |||
| 20535 | int ShiftVal = (IdxVal % 2) * 8; | |||
| 20536 | if (ShiftVal != 0) | |||
| 20537 | Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res, | |||
| 20538 | DAG.getConstant(ShiftVal, dl, MVT::i8)); | |||
| 20539 | return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); | |||
| 20540 | } | |||
| 20541 | ||||
| 20542 | if (VT == MVT::f16 || VT.getSizeInBits() == 32) { | |||
| 20543 | if (IdxVal == 0) | |||
| 20544 | return Op; | |||
| 20545 | ||||
| 20546 | // Shuffle the element to the lowest element, then movss or movsh. | |||
| 20547 | SmallVector<int, 8> Mask(VecVT.getVectorNumElements(), -1); | |||
| 20548 | Mask[0] = static_cast<int>(IdxVal); | |||
| 20549 | Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); | |||
| 20550 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, | |||
| 20551 | DAG.getIntPtrConstant(0, dl)); | |||
| 20552 | } | |||
| 20553 | ||||
| 20554 | if (VT.getSizeInBits() == 64) { | |||
| 20555 | // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b | |||
| 20556 | // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught | |||
| 20557 | // to match extract_elt for f64. | |||
| 20558 | if (IdxVal == 0) | |||
| 20559 | return Op; | |||
| 20560 | ||||
| 20561 | // UNPCKHPD the element to the lowest double word, then movsd. | |||
| 20562 | // Note if the lower 64 bits of the result of the UNPCKHPD is then stored | |||
| 20563 | // to a f64mem, the whole operation is folded into a single MOVHPDmr. | |||
| 20564 | int Mask[2] = { 1, -1 }; | |||
| 20565 | Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask); | |||
| 20566 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec, | |||
| 20567 | DAG.getIntPtrConstant(0, dl)); | |||
| 20568 | } | |||
| 20569 | ||||
| 20570 | return SDValue(); | |||
| 20571 | } | |||
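
The v16i8 fallback in this routine (extract the containing dword or word, shift right, truncate) has a direct scalar analogue, assuming x86's little-endian lane layout. An illustrative sketch of the dword path (not LLVM code):

#include <cstdint>

// Extracts byte IdxVal from a 128-bit vector viewed as four i32 lanes.
static uint8_t extractByte(const uint32_t Vec[4], unsigned IdxVal) {
  uint32_t DWord = Vec[IdxVal / 4];     // extract_vector_elt of v4i32
  unsigned ShiftVal = (IdxVal % 4) * 8; // byte position within the dword
  return (uint8_t)(DWord >> ShiftVal);  // srl + truncate
}
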
| 20572 | ||||
| 20573 | /// Insert one bit to mask vector, like v16i1 or v8i1. | |||
| 20574 | /// AVX-512 feature. | |||
| 20575 | static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, | |||
| 20576 | const X86Subtarget &Subtarget) { | |||
| 20577 | SDLoc dl(Op); | |||
| 20578 | SDValue Vec = Op.getOperand(0); | |||
| 20579 | SDValue Elt = Op.getOperand(1); | |||
| 20580 | SDValue Idx = Op.getOperand(2); | |||
| 20581 | MVT VecVT = Vec.getSimpleValueType(); | |||
| 20582 | ||||
| 20583 | if (!isa<ConstantSDNode>(Idx)) { | |||
| 20584 | // Non-constant index. Extend the source and destination, | |||
| 20585 | // insert the element, and then truncate the result. | |||
| 20586 | unsigned NumElts = VecVT.getVectorNumElements(); | |||
| 20587 | MVT ExtEltVT = (NumElts <= 8) ? MVT::getIntegerVT(128 / NumElts) : MVT::i8; | |||
| 20588 | MVT ExtVecVT = MVT::getVectorVT(ExtEltVT, NumElts); | |||
| 20589 | SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, | |||
| 20590 | DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVecVT, Vec), | |||
| 20591 | DAG.getNode(ISD::SIGN_EXTEND, dl, ExtEltVT, Elt), Idx); | |||
| 20592 | return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp); | |||
| 20593 | } | |||
| 20594 | ||||
| 20595 | // Copy into a k-register, extract to v1i1 and insert_subvector. | |||
| 20596 | SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt); | |||
| 20597 | return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx); | |||
| 20598 | } | |||
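
On the underlying k-register, the constant-index case this feeds into behaves like a masked bit update. A scalar model for a v16i1 mask (illustrative only):

#include <cstdint>

// Models inserting one bit into a 16-lane mask at a constant index.
static uint16_t insertMaskBit(uint16_t KReg, bool Bit, unsigned IdxVal) {
  return (uint16_t)((KReg & ~(1u << IdxVal)) | ((unsigned)Bit << IdxVal));
}
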
| 20599 | ||||
| 20600 | SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, | |||
| 20601 | SelectionDAG &DAG) const { | |||
| 20602 | MVT VT = Op.getSimpleValueType(); | |||
| 20603 | MVT EltVT = VT.getVectorElementType(); | |||
| 20604 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 20605 | unsigned EltSizeInBits = EltVT.getScalarSizeInBits(); | |||
| 20606 | ||||
| 20607 | if (EltVT == MVT::i1) | |||
| 20608 | return InsertBitToMaskVector(Op, DAG, Subtarget); | |||
| 20609 | ||||
| 20610 | SDLoc dl(Op); | |||
| 20611 | SDValue N0 = Op.getOperand(0); | |||
| 20612 | SDValue N1 = Op.getOperand(1); | |||
| 20613 | SDValue N2 = Op.getOperand(2); | |||
| 20614 | auto *N2C = dyn_cast<ConstantSDNode>(N2); | |||
| 20615 | ||||
| 20616 | if (!N2C) { | |||
| 20617 | // With variable insertion indices we're usually better off spilling to stack, | |||
| 20618 | // but AVX512 can use a variable compare+select by comparing against all | |||
| 20619 | // possible vector indices, and FP insertion has less gpr->simd traffic. | |||
| 20620 | if (!(Subtarget.hasBWI() || | |||
| 20621 | (Subtarget.hasAVX512() && EltSizeInBits >= 32) || | |||
| 20622 | (Subtarget.hasSSE41() && (EltVT == MVT::f32 || EltVT == MVT::f64)))) | |||
| 20623 | return SDValue(); | |||
| 20624 | ||||
| 20625 | MVT IdxSVT = MVT::getIntegerVT(EltSizeInBits); | |||
| 20626 | MVT IdxVT = MVT::getVectorVT(IdxSVT, NumElts); | |||
| 20627 | if (!isTypeLegal(IdxSVT) || !isTypeLegal(IdxVT)) | |||
| 20628 | return SDValue(); | |||
| 20629 | ||||
| 20630 | SDValue IdxExt = DAG.getZExtOrTrunc(N2, dl, IdxSVT); | |||
| 20631 | SDValue IdxSplat = DAG.getSplatBuildVector(IdxVT, dl, IdxExt); | |||
| 20632 | SDValue EltSplat = DAG.getSplatBuildVector(VT, dl, N1); | |||
| 20633 | ||||
| 20634 | SmallVector<SDValue, 16> RawIndices; | |||
| 20635 | for (unsigned I = 0; I != NumElts; ++I) | |||
| 20636 | RawIndices.push_back(DAG.getConstant(I, dl, IdxSVT)); | |||
| 20637 | SDValue Indices = DAG.getBuildVector(IdxVT, dl, RawIndices); | |||
| 20638 | ||||
| 20639 | // inselt N0, N1, N2 --> select (SplatN2 == {0,1,2...}) ? SplatN1 : N0. | |||
| 20640 | return DAG.getSelectCC(dl, IdxSplat, Indices, EltSplat, N0, | |||
| 20641 | ISD::CondCode::SETEQ); | |||
| 20642 | } | |||
| 20643 | ||||
| 20644 | if (N2C->getAPIntValue().uge(NumElts)) | |||
| 20645 | return SDValue(); | |||
| 20646 | uint64_t IdxVal = N2C->getZExtValue(); | |||
| 20647 | ||||
| 20648 | bool IsZeroElt = X86::isZeroNode(N1); | |||
| 20649 | bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1); | |||
| 20650 | ||||
| 20651 | if (IsZeroElt || IsAllOnesElt) { | |||
| 20652 | // Lower insertion of v16i8/v32i8/v16i16 -1 elts as an 'OR' blend. | |||
| 20653 | // We don't deal with i8 0 since it appears to be handled elsewhere. | |||
| 20654 | if (IsAllOnesElt && | |||
| 20655 | ((VT == MVT::v16i8 && !Subtarget.hasSSE41()) || | |||
| 20656 | ((VT == MVT::v32i8 || VT == MVT::v16i16) && !Subtarget.hasInt256()))) { | |||
| 20657 | SDValue ZeroCst = DAG.getConstant(0, dl, VT.getScalarType()); | |||
| 20658 | SDValue OnesCst = DAG.getAllOnesConstant(dl, VT.getScalarType()); | |||
| 20659 | SmallVector<SDValue, 8> CstVectorElts(NumElts, ZeroCst); | |||
| 20660 | CstVectorElts[IdxVal] = OnesCst; | |||
| 20661 | SDValue CstVector = DAG.getBuildVector(VT, dl, CstVectorElts); | |||
| 20662 | return DAG.getNode(ISD::OR, dl, VT, N0, CstVector); | |||
| 20663 | } | |||
| 20664 | // See if we can do this more efficiently with a blend shuffle with a | |||
| 20665 | // rematerializable vector. | |||
| 20666 | if (Subtarget.hasSSE41() && | |||
| 20667 | (EltSizeInBits >= 16 || (IsZeroElt && !VT.is128BitVector()))) { | |||
| 20668 | SmallVector<int, 8> BlendMask; | |||
| 20669 | for (unsigned i = 0; i != NumElts; ++i) | |||
| 20670 | BlendMask.push_back(i == IdxVal ? i + NumElts : i); | |||
| 20671 | SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl) | |||
| 20672 | : getOnesVector(VT, DAG, dl); | |||
| 20673 | return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask); | |||
| 20674 | } | |||
| 20675 | } | |||
| 20676 | ||||
| 20677 | // If the vector is wider than 128 bits, extract the 128-bit subvector, insert | |||
| 20678 | // into that, and then insert the subvector back into the result. | |||
| 20679 | if (VT.is256BitVector() || VT.is512BitVector()) { | |||
| 20680 | // With a 256-bit vector, we can insert into the zero element efficiently | |||
| 20681 | // using a blend if we have AVX or AVX2 and the right data type. | |||
| 20682 | if (VT.is256BitVector() && IdxVal == 0) { | |||
| 20683 | // TODO: It is worthwhile to cast integer to floating point and back | |||
| 20684 | // and incur a domain crossing penalty if that's what we'll end up | |||
| 20685 | // doing anyway after extracting to a 128-bit vector. | |||
| 20686 | if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) || | |||
| 20687 | (Subtarget.hasAVX2() && (EltVT == MVT::i32 || EltVT == MVT::i64))) { | |||
| 20688 | SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); | |||
| 20689 | return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, | |||
| 20690 | DAG.getTargetConstant(1, dl, MVT::i8)); | |||
| 20691 | } | |||
| 20692 | } | |||
| 20693 | ||||
| 20694 | unsigned NumEltsIn128 = 128 / EltSizeInBits; | |||
| 20695 | assert(isPowerOf2_32(NumEltsIn128) && | |||
| 20696 | "Vectors will always have power-of-two number of elements.") | |||
| 20697 | ||||
| 20698 | // If we are not inserting into the low 128-bit vector chunk, | |||
| 20699 | // then prefer the broadcast+blend sequence. | |||
| 20700 | // FIXME: relax the profitability check iff all N1 uses are insertions. | |||
| 20701 | if (IdxVal >= NumEltsIn128 && | |||
| 20702 | ((Subtarget.hasAVX2() && EltSizeInBits != 8) || | |||
| 20703 | (Subtarget.hasAVX() && (EltSizeInBits >= 32) && | |||
| 20704 | X86::mayFoldLoad(N1, Subtarget)))) { | |||
| 20705 | SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1); | |||
| 20706 | SmallVector<int, 8> BlendMask; | |||
| 20707 | for (unsigned i = 0; i != NumElts; ++i) | |||
| 20708 | BlendMask.push_back(i == IdxVal ? i + NumElts : i); | |||
| 20709 | return DAG.getVectorShuffle(VT, dl, N0, N1SplatVec, BlendMask); | |||
| 20710 | } | |||
| 20711 | ||||
| 20712 | // Get the desired 128-bit vector chunk. | |||
| 20713 | SDValue V = extract128BitVector(N0, IdxVal, DAG, dl); | |||
| 20714 | ||||
| 20715 | // Insert the element into the desired chunk. | |||
| 20716 | // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo. | |||
| 20717 | unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1); | |||
| 20718 | ||||
| 20719 | V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1, | |||
| 20720 | DAG.getIntPtrConstant(IdxIn128, dl)); | |||
| 20721 | ||||
| 20722 | // Insert the changed part back into the bigger vector | |||
| 20723 | return insert128BitVector(N0, V, IdxVal, DAG, dl); | |||
| 20724 | } | |||
| 20725 | assert(VT.is128BitVector() && "Only 128-bit vector types should be left!"); | |||
| 20726 | ||||
| 20727 | // This will be just movw/movd/movq/movsh/movss/movsd. | |||
| 20728 | if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) { | |||
| 20729 | if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 || | |||
| 20730 | EltVT == MVT::f16 || EltVT == MVT::i64) { | |||
| 20731 | N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1); | |||
| 20732 | return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG); | |||
| 20733 | } | |||
| 20734 | ||||
| 20735 | // We can't directly insert an i8 or i16 into a vector, so zero extend | |||
| 20736 | // it to i32 first. | |||
| 20737 | if (EltVT == MVT::i16 || EltVT == MVT::i8) { | |||
| 20738 | N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1); | |||
| 20739 | MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32); | |||
| 20740 | N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1); | |||
| 20741 | N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG); | |||
| 20742 | return DAG.getBitcast(VT, N1); | |||
| 20743 | } | |||
| 20744 | } | |||

  // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
  // argument. SSE41 required for pinsrb.
  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
    unsigned Opc;
    if (VT == MVT::v8i16) {
      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
      Opc = X86ISD::PINSRW;
    } else {
      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
      Opc = X86ISD::PINSRB;
    }

    assert(N1.getValueType() != MVT::i32 && "Unexpected VT");
    N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
    N2 = DAG.getTargetConstant(IdxVal, dl, MVT::i8);
    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
  }

  if (Subtarget.hasSSE41()) {
    if (EltVT == MVT::f32) {
      // Bits [7:6] of the constant are the source select. This will always be
      // zero here. The DAG Combiner may combine an extract_elt index into
      // these bits. For example (insert (extract, 3), 2) could be matched by
      // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
      // Bits [5:4] of the constant are the destination select. This is the
      // value of the incoming immediate.
      // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
      // combine either bitwise AND or insert of float 0.0 to set these bits.
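      // For example, IdxVal == 2 yields the immediate 0x20 below: copy
      // source element 0 into destination element 2 and zero nothing.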

      bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
      if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
        // If this is an insertion of 32-bits into the low 32-bits of
        // a vector, we prefer to generate a blend with immediate rather
        // than an insertps. Blends are simpler operations in hardware and so
        // will always have equal or better performance than insertps.
        // But if optimizing for size and there's a load folding opportunity,
        // generate insertps because blendps does not have a 32-bit memory
        // operand form.
        N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
        return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1,
                           DAG.getTargetConstant(1, dl, MVT::i8));
      }
      // Create this as a scalar to vector.
      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
      return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1,
                         DAG.getTargetConstant(IdxVal << 4, dl, MVT::i8));
    }

    // PINSR* works with constant index.
    if (EltVT == MVT::i32 || EltVT == MVT::i64)
      return Op;
  }

  return SDValue();
}

static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT OpVT = Op.getSimpleValueType();

  // Replacing a xor+movd pair with xorps is always cheaper and simplifies
  // further combines.
  if (X86::isZeroNode(Op.getOperand(0)))
    return getZeroVector(OpVT, Subtarget, DAG, dl);

  // If this is a 256-bit vector result, first insert into a 128-bit
  // vector and then insert into the 256-bit vector.
  if (!OpVT.is128BitVector()) {
    // Insert into a 128-bit vector.
    unsigned SizeFactor = OpVT.getSizeInBits() / 128;
    MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
                                 OpVT.getVectorNumElements() / SizeFactor);

    Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));

    // Insert the 128-bit vector.
    return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
  }
  assert(OpVT.is128BitVector() && OpVT.isInteger() && OpVT != MVT::v2i64 &&
         "Expected an SSE type!");

  // Pass through a v4i32 or v8i16 SCALAR_TO_VECTOR as that's what we use in
  // tblgen.
  if (OpVT == MVT::v4i32 || (OpVT == MVT::v8i16 && Subtarget.hasFP16()))
    return Op;

  SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
  return DAG.getBitcast(
      OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}

// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);

  return insert1BitVector(Op, DAG, Subtarget);
}

static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Only vXi1 extract_subvectors need custom lowering");

  SDLoc dl(Op);
  SDValue Vec = Op.getOperand(0);
  uint64_t IdxVal = Op.getConstantOperandVal(1);

  if (IdxVal == 0) // the operation is legal
    return Op;

  MVT VecVT = Vec.getSimpleValueType();
  unsigned NumElems = VecVT.getVectorNumElements();

  // Extend to natively supported kshift.
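  // Note: a kshift of a byte-sized mask requires AVX512DQ; without it the
  // narrowest legal kshift operates on 16-bit masks, hence the v16i1 choice.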
  MVT WideVecVT = VecVT;
  if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) {
    WideVecVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVecVT,
                      DAG.getUNDEF(WideVecVT), Vec,
                      DAG.getIntPtrConstant(0, dl));
  }

  // Shift to the LSB.
  Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideVecVT, Vec,
                    DAG.getTargetConstant(IdxVal, dl, MVT::i8));

  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, Op.getValueType(), Vec,
                     DAG.getIntPtrConstant(0, dl));
}

// Returns the appropriate wrapper opcode for a global reference.
unsigned X86TargetLowering::getGlobalWrapperKind(
    const GlobalValue *GV, const unsigned char OpFlags) const {
  // References to absolute symbols are never PC-relative.
  if (GV && GV->isAbsoluteSymbolRef())
    return X86ISD::Wrapper;

  CodeModel::Model M = getTargetMachine().getCodeModel();
  if (Subtarget.isPICStyleRIPRel() &&
      (M == CodeModel::Small || M == CodeModel::Kernel))
    return X86ISD::WrapperRIP;

  // In the medium model, functions can always be referenced RIP-relatively,
  // since they must be within 2GiB. This is also possible in non-PIC mode, and
  // shorter than the 64-bit absolute immediate that would otherwise be emitted.
  if (M == CodeModel::Medium && isa_and_nonnull<Function>(GV))
    return X86ISD::WrapperRIP;

  // GOTPCREL references must always use RIP.
  if (OpFlags == X86II::MO_GOTPCREL || OpFlags == X86II::MO_GOTPCREL_NORELAX)
    return X86ISD::WrapperRIP;

  return X86ISD::Wrapper;
}

// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
// one of the above mentioned nodes. It has to be wrapped because otherwise
// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
// be used to form addressing mode. These wrapped nodes will be selected
// into MOV32ri.
SDValue
X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetConstantPool(
      CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
  SDLoc DL(CP);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
  // With PIC, the address is actually $g + Offset.
  if (OpFlag) {
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
  }

  return Result;
}

SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);

  // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
  // global base reg.
  unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);

  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
  SDLoc DL(JT);
  Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (OpFlag)
    Result =
        DAG.getNode(ISD::ADD, DL, PtrVT,
                    DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);

  return Result;
}

SDValue X86TargetLowering::LowerExternalSymbol(SDValue Op,
                                               SelectionDAG &DAG) const {
  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}

SDValue
X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
  // Create the TargetBlockAddress node.
  unsigned char OpFlags =
      Subtarget.classifyBlockAddressReference();
  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
  int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
  Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (isGlobalRelativeToPICBase(OpFlags)) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  return Result;
}

/// Creates target global address or external symbol nodes for calls or
/// other uses.
SDValue X86TargetLowering::LowerGlobalOrExternal(SDValue Op, SelectionDAG &DAG,
                                                 bool ForCall) const {
  // Unpack the global address or external symbol.
  const SDLoc &dl = SDLoc(Op);
  const GlobalValue *GV = nullptr;
  int64_t Offset = 0;
  const char *ExternalSym = nullptr;
  if (const auto *G = dyn_cast<GlobalAddressSDNode>(Op)) {
    GV = G->getGlobal();
    Offset = G->getOffset();
  } else {
    const auto *ES = cast<ExternalSymbolSDNode>(Op);
    ExternalSym = ES->getSymbol();
  }

  // Calculate some flags for address lowering.
  const Module &Mod = *DAG.getMachineFunction().getFunction().getParent();
  unsigned char OpFlags;
  if (ForCall)
    OpFlags = Subtarget.classifyGlobalFunctionReference(GV, Mod);
  else
    OpFlags = Subtarget.classifyGlobalReference(GV, Mod);
  bool HasPICReg = isGlobalRelativeToPICBase(OpFlags);
  bool NeedsLoad = isGlobalStubReference(OpFlags);

  CodeModel::Model M = DAG.getTarget().getCodeModel();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  SDValue Result;

  if (GV) {
    // Create a target global address if this is a global. If possible, fold the
    // offset into the global address reference. Otherwise, ADD it on later.
    // Suppress the folding if Offset is negative: movl foo-1, %eax is not
    // allowed because if the address of foo is 0, the ELF R_X86_64_32
    // relocation will compute to a negative value, which is invalid.
    int64_t GlobalOffset = 0;
    if (OpFlags == X86II::MO_NO_FLAG && Offset >= 0 &&
        X86::isOffsetSuitableForCodeModel(Offset, M, true)) {
      std::swap(GlobalOffset, Offset);
    }
    Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, GlobalOffset, OpFlags);
  } else {
    // If this is not a global address, this must be an external symbol.
    Result = DAG.getTargetExternalSymbol(ExternalSym, PtrVT, OpFlags);
  }

  // If this is a direct call, avoid the wrapper if we don't need to do any
  // loads or adds. This allows SDAG ISel to match direct calls.
  if (ForCall && !NeedsLoad && !HasPICReg && Offset == 0)
    return Result;

  Result = DAG.getNode(getGlobalWrapperKind(GV, OpFlags), dl, PtrVT, Result);

  // With PIC, the address is actually $g + Offset.
  if (HasPICReg) {
    Result = DAG.getNode(ISD::ADD, dl, PtrVT,
                         DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
  }

  // For globals that require a load from a stub to get the address, emit the
  // load.
  if (NeedsLoad)
    Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));

  // If there was a non-zero offset that we didn't fold, create an explicit
  // addition for it.
  if (Offset != 0)
    Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
                         DAG.getConstant(Offset, dl, PtrVT));

  return Result;
}

SDValue
X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
  return LowerGlobalOrExternal(Op, DAG, /*ForCall=*/false);
}

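// Emit an X86ISD::TLSADDR or X86ISD::TLSBASEADDR node (codegen'ed as a call)
// for the given global and copy the resulting address out of ReturnReg.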
static SDValue
GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
           SDValue *InGlue, const EVT PtrVT, unsigned ReturnReg,
           unsigned char OperandFlags, bool LocalDynamic = false) {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDLoc dl(GA);
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(),
                                           OperandFlags);

  X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
                                           : X86ISD::TLSADDR;

  if (InGlue) {
    SDValue Ops[] = { Chain, TGA, *InGlue };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  } else {
    SDValue Ops[] = { Chain, TGA };
    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
  }

  // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
  MFI.setAdjustsStack(true);
  MFI.setHasCalls(true);

  SDValue Glue = Chain.getValue(1);
  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
static SDValue
LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  SDValue InGlue;
  SDLoc dl(GA);  // ? function entry point might be better
  SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
                                   DAG.getNode(X86ISD::GlobalBaseReg,
                                               SDLoc(), PtrVT), InGlue);
  InGlue = Chain.getValue(1);

  return GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit LP64
static SDValue
LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                const EVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
                    X86::RAX, X86II::MO_TLSGD);
}

// Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit ILP32
static SDValue
LowerToTLSGeneralDynamicModelX32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                 const EVT PtrVT) {
  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
                    X86::EAX, X86II::MO_TLSGD);
}

static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
                                           SelectionDAG &DAG, const EVT PtrVT,
                                           bool Is64Bit, bool Is64BitLP64) {
  SDLoc dl(GA);

  // Get the start address of the TLS block for this module.
  X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
                                   .getInfo<X86MachineFunctionInfo>();
  MFI->incNumLocalDynamicTLSAccesses();

  SDValue Base;
  if (Is64Bit) {
    unsigned ReturnReg = Is64BitLP64 ? X86::RAX : X86::EAX;
    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, ReturnReg,
                      X86II::MO_TLSLD, /*LocalDynamic=*/true);
  } else {
    SDValue InGlue;
    SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
        DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InGlue);
    InGlue = Chain.getValue(1);
    Base = GetTLSADDR(DAG, Chain, GA, &InGlue, PtrVT, X86::EAX,
                      X86II::MO_TLSLDM, /*LocalDynamic=*/true);
  }

  // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
  // of Base.

  // Build x@dtpoff.
  unsigned char OperandFlags = X86II::MO_DTPOFF;
  unsigned WrapperKind = X86ISD::Wrapper;
  SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                           GA->getValueType(0),
                                           GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  // Add x@dtpoff with the base.
  return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
}

// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                   const EVT PtrVT, TLSModel::Model model,
                                   bool is64Bit, bool isPIC) {
  SDLoc dl(GA);

  // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
  Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
                                                         is64Bit ? 257 : 256));

  SDValue ThreadPointer =
      DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
                  MachinePointerInfo(Ptr));

  unsigned char OperandFlags = 0;
  // Most TLS accesses are not RIP relative, even on x86-64. One exception is
  // initialexec.
  unsigned WrapperKind = X86ISD::Wrapper;
  if (model == TLSModel::LocalExec) {
    OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
  } else if (model == TLSModel::InitialExec) {
    if (is64Bit) {
      OperandFlags = X86II::MO_GOTTPOFF;
      WrapperKind = X86ISD::WrapperRIP;
    } else {
      OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
    }
  } else {
    llvm_unreachable("Unexpected model");
  }

  // emit "addl x@ntpoff,%eax" (local exec)
  // or   "addl x@indntpoff,%eax" (initial exec)
  // or   "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
  SDValue TGA =
      DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
                                 GA->getOffset(), OperandFlags);
  SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);

  if (model == TLSModel::InitialExec) {
    if (isPIC && !is64Bit) {
      Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);
    }

    Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
                         MachinePointerInfo::getGOT(DAG.getMachineFunction()));
  }

  // The address of the thread local variable is the add of the thread
  // pointer with the offset of the variable.
  return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
}

SDValue
X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {

  GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);

  if (DAG.getTarget().useEmulatedTLS())
    return LowerToTLSEmulatedModel(GA, DAG);

  const GlobalValue *GV = GA->getGlobal();
  auto PtrVT = getPointerTy(DAG.getDataLayout());
  bool PositionIndependent = isPositionIndependent();

  if (Subtarget.isTargetELF()) {
    TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
    switch (model) {
    case TLSModel::GeneralDynamic:
      if (Subtarget.is64Bit()) {
        if (Subtarget.isTarget64BitLP64())
          return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
        return LowerToTLSGeneralDynamicModelX32(GA, DAG, PtrVT);
      }
      return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
    case TLSModel::LocalDynamic:
      return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT, Subtarget.is64Bit(),
                                         Subtarget.isTarget64BitLP64());
    case TLSModel::InitialExec:
    case TLSModel::LocalExec:
      return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
                                 PositionIndependent);
    }
    llvm_unreachable("Unknown TLS model.");
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin only has one model of TLS. Lower to that.
    unsigned char OpFlag = 0;
    unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
                           X86ISD::WrapperRIP : X86ISD::Wrapper;

    // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
    // global base reg.
    bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
    if (PIC32)
      OpFlag = X86II::MO_TLVP_PIC_BASE;
    else
      OpFlag = X86II::MO_TLVP;
    SDLoc DL(Op);
    SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
                                                GA->getValueType(0),
                                                GA->getOffset(), OpFlag);
    SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);

    // With PIC32, the address is actually $g + Offset.
    if (PIC32)
      Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
                           DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
                           Offset);

    // Lowering the machine ISD will make sure everything is in the right
    // location.
    SDValue Chain = DAG.getEntryNode();
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
    SDValue Args[] = { Chain, Offset };
    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
    Chain = DAG.getCALLSEQ_END(Chain, 0, 0, Chain.getValue(1), DL);

    // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
    MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
    MFI.setAdjustsStack(true);

    // And our return value (tls address) is in the standard call return value
    // location.
    unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
    return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
  }

  if (Subtarget.isOSWindows()) {
    // Just use the implicit TLS architecture.
    // Need to generate something similar to:
    //   mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
    //                               ; from TEB
    //   mov ecx, dword [rel _tls_index]; Load index (from C runtime)
    //   mov rcx, qword [rdx+rcx*8]
    //   mov eax, .tls$:tlsvar
    //   [rax+rcx] contains the address
    // Windows 64bit: gs:0x58
    // Windows 32bit: fs:__tls_array

    SDLoc dl(GA);
    SDValue Chain = DAG.getEntryNode();

    // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
    // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
    // use its literal value of 0x2C.
    Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
                                            ? Type::getInt8PtrTy(*DAG.getContext(),
                                                                 256)
                                            : Type::getInt32PtrTy(*DAG.getContext(),
                                                                  257));

    SDValue TlsArray = Subtarget.is64Bit()
                           ? DAG.getIntPtrConstant(0x58, dl)
                           : (Subtarget.isTargetWindowsGNU()
                                  ? DAG.getIntPtrConstant(0x2C, dl)
                                  : DAG.getExternalSymbol("_tls_array", PtrVT));

    SDValue ThreadPointer =
        DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));

    SDValue res;
    if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
      res = ThreadPointer;
    } else {
      // Load the _tls_index variable.
      SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
      if (Subtarget.is64Bit())
        IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
                             MachinePointerInfo(), MVT::i32);
      else
        IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());

      const DataLayout &DL = DAG.getDataLayout();
      SDValue Scale =
          DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, MVT::i8);
      IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);

      res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
    }

    res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());

    // Get the offset of the variable from the start of the .tls section.
    SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
                                             GA->getValueType(0),
                                             GA->getOffset(), X86II::MO_SECREL);
    SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);

    // The address of the thread local variable is the add of the thread
    // pointer with the offset of the variable.
    return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
  }

  llvm_unreachable("TLS not implemented for this target.");
}

/// Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
/// TODO: Can this be moved to general expansion code?
static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
  SDValue Lo, Hi;
  DAG.getTargetLoweringInfo().expandShiftParts(Op.getNode(), Lo, Hi, DAG);
  return DAG.getMergeValues({Lo, Hi}, SDLoc(Op));
}

// Try to use a packed vector operation to handle i64 on 32-bit targets when
// AVX512DQ is enabled.
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
          Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
          Op.getOpcode() == ISD::UINT_TO_FP) &&
         "Unexpected opcode!");
  bool IsStrict = Op->isStrictFPOpcode();
  unsigned OpNo = IsStrict ? 1 : 0;
  SDValue Src = Op.getOperand(OpNo);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();

  if (!Subtarget.hasDQI() || SrcVT != MVT::i64 || Subtarget.is64Bit() ||
      (VT != MVT::f32 && VT != MVT::f64))
    return SDValue();

  // Pack the i64 into a vector, do the operation and extract.

  // Using 256-bit to ensure result is 128-bits for f32 case.
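  // Without VLX only 512-bit vectors support these conversions, so widen to
  // v8i64 and take the scalar result from element 0.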
  unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
  MVT VecInVT = MVT::getVectorVT(MVT::i64, NumElts);
  MVT VecVT = MVT::getVectorVT(VT, NumElts);

  SDLoc dl(Op);
  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
  if (IsStrict) {
    SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
                                 {Op.getOperand(0), InVec});
    SDValue Chain = CvtVec.getValue(1);
    SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
                                DAG.getIntPtrConstant(0, dl));
    return DAG.getMergeValues({Value, Chain}, dl);
  }

  SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
                     DAG.getIntPtrConstant(0, dl));
}

// Try to use a packed vector operation to handle i64 on 32-bit targets.
static SDValue LowerI64IntToFP16(SDValue Op, SelectionDAG &DAG,
                                 const X86Subtarget &Subtarget) {
  assert((Op.getOpcode() == ISD::SINT_TO_FP ||
          Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
          Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
          Op.getOpcode() == ISD::UINT_TO_FP) &&
         "Unexpected opcode!");
  bool IsStrict = Op->isStrictFPOpcode();
  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  MVT SrcVT = Src.getSimpleValueType();
  MVT VT = Op.getSimpleValueType();

  if (SrcVT != MVT::i64 || Subtarget.is64Bit() || VT != MVT::f16)
    return SDValue();

  // Pack the i64 into a vector, do the operation and extract.

  assert(Subtarget.hasFP16() && "Expected FP16");

  SDLoc dl(Op);
  SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src);
  if (IsStrict) {
    SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {MVT::v2f16, MVT::Other},
                                 {Op.getOperand(0), InVec});
    SDValue Chain = CvtVec.getValue(1);
    SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
                                DAG.getIntPtrConstant(0, dl));
    return DAG.getMergeValues({Value, Chain}, dl);
  }

  SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, MVT::v2f16, InVec);

  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
                     DAG.getIntPtrConstant(0, dl));
}

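// Return true if a single 128-bit vector instruction can perform this
// int-to-fp cast for the given type pair on this subtarget.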
static bool useVectorCast(unsigned Opcode, MVT FromVT, MVT ToVT,
                          const X86Subtarget &Subtarget) {
  switch (Opcode) {
  case ISD::SINT_TO_FP:
    // TODO: Handle wider types with AVX/AVX512.
    if (!Subtarget.hasSSE2() || FromVT != MVT::v4i32)
      return false;
    // CVTDQ2PS or (V)CVTDQ2PD
    return ToVT == MVT::v4f32 || (Subtarget.hasAVX() && ToVT == MVT::v4f64);

  case ISD::UINT_TO_FP:
    // TODO: Handle wider types and i64 elements.
    if (!Subtarget.hasAVX512() || FromVT != MVT::v4i32)
      return false;
    // VCVTUDQ2PS or VCVTUDQ2PD
    return ToVT == MVT::v4f32 || ToVT == MVT::v4f64;

  default:
    return false;
  }
}

/// Given a scalar cast operation that is extracted from a vector, try to
/// vectorize the cast op followed by extraction. This will avoid an expensive
/// round-trip between XMM and GPR.
static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget) {
  // TODO: This could be enhanced to handle smaller integer types by peeking
  // through an extend.
  SDValue Extract = Cast.getOperand(0);
  MVT DestVT = Cast.getSimpleValueType();
  if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
      !isa<ConstantSDNode>(Extract.getOperand(1)))
    return SDValue();

  // See if we have a 128-bit vector cast op for this type of cast.
  SDValue VecOp = Extract.getOperand(0);
  MVT FromVT = VecOp.getSimpleValueType();
  unsigned NumEltsInXMM = 128 / FromVT.getScalarSizeInBits();
  MVT Vec128VT = MVT::getVectorVT(FromVT.getScalarType(), NumEltsInXMM);
  MVT ToVT = MVT::getVectorVT(DestVT, NumEltsInXMM);
  if (!useVectorCast(Cast.getOpcode(), Vec128VT, ToVT, Subtarget))
    return SDValue();

  // If we are extracting from a non-zero element, first shuffle the source
  // vector to allow extracting from element zero.
  SDLoc DL(Cast);
  if (!isNullConstant(Extract.getOperand(1))) {
    SmallVector<int, 16> Mask(FromVT.getVectorNumElements(), -1);
    Mask[0] = Extract.getConstantOperandVal(1);
    VecOp = DAG.getVectorShuffle(FromVT, DL, VecOp, DAG.getUNDEF(FromVT), Mask);
  }
  // If the source vector is wider than 128-bits, extract the low part. Do not
  // create an unnecessarily wide vector cast op.
  if (FromVT != Vec128VT)
    VecOp = extract128BitVector(VecOp, 0, DAG, DL);

  // cast (extelt V, 0) --> extelt (cast (extract_subv V)), 0
  // cast (extelt V, C) --> extelt (cast (extract_subv (shuffle V, [C...]))), 0
  SDValue VCast = DAG.getNode(Cast.getOpcode(), DL, ToVT, VecOp);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, DestVT, VCast,
                     DAG.getIntPtrConstant(0, DL));
}

/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
/// try to vectorize the cast ops. This will avoid an expensive round-trip
/// between XMM and GPR.
static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  // TODO: Allow FP_TO_UINT.
  SDValue CastToInt = CastToFP.getOperand(0);
  MVT VT = CastToFP.getSimpleValueType();
  if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
    return SDValue();

  MVT IntVT = CastToInt.getSimpleValueType();
  SDValue X = CastToInt.getOperand(0);
  MVT SrcVT = X.getSimpleValueType();
  if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
    return SDValue();

  // See if we have 128-bit vector cast instructions for this type of cast.
  // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
  if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
      IntVT != MVT::i32)
    return SDValue();

  unsigned SrcSize = SrcVT.getSizeInBits();
  unsigned IntSize = IntVT.getSizeInBits();
  unsigned VTSize = VT.getSizeInBits();
  MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
  MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
  MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);

  // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
  unsigned ToIntOpcode =
      SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
  unsigned ToFPOpcode =
      IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;

  // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
  //
  // We are not defining the high elements (for example, zero them) because
  // that could nullify any performance advantage that we hoped to gain from
  // this vector op hack. We do not expect any adverse effects (like denorm
  // penalties) with cast ops.
  SDLoc DL(CastToFP);
  SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
  SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
  SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
  SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
}

static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
  bool IsStrict = Op->isStrictFPOpcode();
  MVT VT = Op->getSimpleValueType(0);
  SDValue Src = Op->getOperand(IsStrict ? 1 : 0);

  if (Subtarget.hasDQI()) {
    assert(!Subtarget.hasVLX() && "Unexpected features");

    assert((Src.getSimpleValueType() == MVT::v2i64 ||
            Src.getSimpleValueType() == MVT::v4i64) &&
           "Unsupported custom type");

    // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
    assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
           "Unexpected VT!");
    MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;

    // Need to concat with zero vector for strict fp to avoid spurious
    // exceptions.
    SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
                           : DAG.getUNDEF(MVT::v8i64);
    Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
                      DAG.getIntPtrConstant(0, DL));
    SDValue Res, Chain;
    if (IsStrict) {
      Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
                        {Op->getOperand(0), Src});
      Chain = Res.getValue(1);
    } else {
      Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
    }

    Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                      DAG.getIntPtrConstant(0, DL));

    if (IsStrict)
      return DAG.getMergeValues({Res, Chain}, DL);
    return Res;
  }

  bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
                  Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
  if (VT != MVT::v4f32 || IsSigned)
    return SDValue();

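  // v4i64 -> v4f32 unsigned conversion without DQI: convert each element with
  // the signed scalar converter. An input with the sign bit set is first
  // halved, ORing the shifted-out bit back in as a sticky bit so rounding is
  // unaffected, and the converted value is then doubled with an FADD.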
  SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
  SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
  SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
                             DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
                             DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
  SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
  SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
  SmallVector<SDValue, 4> SignCvts(4);
  SmallVector<SDValue, 4> Chains(4);
  for (int i = 0; i != 4; ++i) {
    SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
                              DAG.getIntPtrConstant(i, DL));
    if (IsStrict) {
      SignCvts[i] =
          DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
                      {Op.getOperand(0), Elt});
      Chains[i] = SignCvts[i].getValue(1);
    } else {
      SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
    }
  }
  SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);

  SDValue Slow, Chain;
  if (IsStrict) {
    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
    Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
                       {Chain, SignCvt, SignCvt});
    Chain = Slow.getValue(1);
  } else {
    Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
  }

  IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
  SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);

  if (IsStrict)
    return DAG.getMergeValues({Cvt, Chain}, DL);

  return Cvt;
}

| 21656 | static SDValue promoteXINT_TO_FP(SDValue Op, SelectionDAG &DAG) { | |||
| 21657 | bool IsStrict = Op->isStrictFPOpcode(); | |||
| 21658 | SDValue Src = Op.getOperand(IsStrict ? 1 : 0); | |||
| 21659 | SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode(); | |||
| 21660 | MVT VT = Op.getSimpleValueType(); | |||
| 21661 | MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32; | |||
| 21662 | SDLoc dl(Op); | |||
| 21663 | ||||
| 21664 | SDValue Rnd = DAG.getIntPtrConstant(0, dl); | |||
| 21665 | if (IsStrict) | |||
| 21666 | return DAG.getNode( | |||
| 21667 | ISD::STRICT_FP_ROUND, dl, {VT, MVT::Other}, | |||
| 21668 | {Chain, | |||
| 21669 | DAG.getNode(Op.getOpcode(), dl, {NVT, MVT::Other}, {Chain, Src}), | |||
| 21670 | Rnd}); | |||
| 21671 | return DAG.getNode(ISD::FP_ROUND, dl, VT, | |||
| 21672 | DAG.getNode(Op.getOpcode(), dl, NVT, Src), Rnd); | |||
| 21673 | } | |||
| 21674 | ||||
| 21675 | static bool isLegalConversion(MVT VT, bool IsSigned, | |||
| 21676 | const X86Subtarget &Subtarget) { | |||
| 21677 | if (VT == MVT::v4i32 && Subtarget.hasSSE2() && IsSigned) | |||
| 21678 | return true; | |||
| 21679 | if (VT == MVT::v8i32 && Subtarget.hasAVX() && IsSigned) | |||
| 21680 | return true; | |||
| 21681 | if (Subtarget.hasVLX() && (VT == MVT::v4i32 || VT == MVT::v8i32)) | |||
| 21682 | return true; | |||
| 21683 | if (Subtarget.useAVX512Regs()) { | |||
| 21684 | if (VT == MVT::v16i32) | |||
| 21685 | return true; | |||
| 21686 | if (VT == MVT::v8i64 && Subtarget.hasDQI()) | |||
| 21687 | return true; | |||
| 21688 | } | |||
| 21689 | if (Subtarget.hasDQI() && Subtarget.hasVLX() && | |||
| 21690 | (VT == MVT::v2i64 || VT == MVT::v4i64)) | |||
| 21691 | return true; | |||
| 21692 | return false; | |||
| 21693 | } | |||
| 21694 | ||||
| 21695 | SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op, | |||
| 21696 | SelectionDAG &DAG) const { | |||
| 21697 | bool IsStrict = Op->isStrictFPOpcode(); | |||
| 21698 | unsigned OpNo = IsStrict ? 1 : 0; | |||
| 21699 | SDValue Src = Op.getOperand(OpNo); | |||
| 21700 | SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode(); | |||
| 21701 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 21702 | MVT VT = Op.getSimpleValueType(); | |||
| 21703 | SDLoc dl(Op); | |||
| 21704 | ||||
| 21705 | if (isSoftFP16(VT)) | |||
| 21706 | return promoteXINT_TO_FP(Op, DAG); | |||
| 21707 | else if (isLegalConversion(SrcVT, true, Subtarget)) | |||
| 21708 | return Op; | |||
| 21709 | ||||
| 21710 | if (Subtarget.isTargetWin64() && SrcVT == MVT::i128) | |||
| 21711 | return LowerWin64_INT128_TO_FP(Op, DAG); | |||
| 21712 | ||||
| 21713 | if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) | |||
| 21714 | return Extract; | |||
| 21715 | ||||
| 21716 | if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget)) | |||
| 21717 | return R; | |||
| 21718 | ||||
| 21719 | if (SrcVT.isVector()) { | |||
| 21720 | if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) { | |||
| 21721 | // Note: since v2f64 is a legal type, we don't need to zero extend the
| 21722 | // source for strict FP.
| 21723 | if (IsStrict) | |||
| 21724 | return DAG.getNode( | |||
| 21725 | X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other}, | |||
| 21726 | {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, | |||
| 21727 | DAG.getUNDEF(SrcVT))}); | |||
| 21728 | return DAG.getNode(X86ISD::CVTSI2P, dl, VT, | |||
| 21729 | DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, | |||
| 21730 | DAG.getUNDEF(SrcVT))); | |||
| 21731 | } | |||
| 21732 | if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64) | |||
| 21733 | return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget); | |||
| 21734 | ||||
| 21735 | return SDValue(); | |||
| 21736 | } | |||
| 21737 | ||||
| 21738 | assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
| 21739 |        "Unknown SINT_TO_FP to lower!");
| 21740 | ||||
| 21741 | bool UseSSEReg = isScalarFPTypeInSSEReg(VT); | |||
| 21742 | ||||
| 21743 | // These are really Legal; return the operand so the caller accepts it as | |||
| 21744 | // Legal. | |||
| 21745 | if (SrcVT == MVT::i32 && UseSSEReg) | |||
| 21746 | return Op; | |||
| 21747 | if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit()) | |||
| 21748 | return Op; | |||
| 21749 | ||||
| 21750 | if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) | |||
| 21751 | return V; | |||
| 21752 | if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget)) | |||
| 21753 | return V; | |||
| 21754 | ||||
| 21755 | // SSE doesn't have an i16 conversion so we need to promote. | |||
| 21756 | if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) { | |||
| 21757 | SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src); | |||
| 21758 | if (IsStrict) | |||
| 21759 | return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other}, | |||
| 21760 | {Chain, Ext}); | |||
| 21761 | ||||
| 21762 | return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext); | |||
| 21763 | } | |||
| 21764 | ||||
| 21765 | if (VT == MVT::f128 || !Subtarget.hasX87()) | |||
| 21766 | return SDValue(); | |||
| 21767 | ||||
| 21768 | SDValue ValueToStore = Src; | |||
| 21769 | if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit()) | |||
| 21770 | // Bitcasting to f64 here allows us to do a single 64-bit store from | |||
| 21771 | // an SSE register, avoiding the store forwarding penalty that would come | |||
| 21772 | // with two 32-bit stores. | |||
| 21773 | ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); | |||
| 21774 | ||||
| 21775 | unsigned Size = SrcVT.getStoreSize(); | |||
| 21776 | Align Alignment(Size); | |||
| 21777 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 21778 | auto PtrVT = getPointerTy(MF.getDataLayout()); | |||
| 21779 | int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false); | |||
| 21780 | MachinePointerInfo MPI = | |||
| 21781 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI); | |||
| 21782 | SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); | |||
| 21783 | Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment); | |||
| 21784 | std::pair<SDValue, SDValue> Tmp = | |||
| 21785 | BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG); | |||
| 21786 | ||||
| 21787 | if (IsStrict) | |||
| 21788 | return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); | |||
| 21789 | ||||
| 21790 | return Tmp.first; | |||
| 21791 | } | |||
| 21792 | ||||
| 21793 | std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD( | |||
| 21794 | EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer, | |||
| 21795 | MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const { | |||
| 21796 | // Build the FILD | |||
| 21797 | SDVTList Tys; | |||
| 21798 | bool useSSE = isScalarFPTypeInSSEReg(DstVT); | |||
| 21799 | if (useSSE) | |||
| 21800 | Tys = DAG.getVTList(MVT::f80, MVT::Other); | |||
| 21801 | else | |||
| 21802 | Tys = DAG.getVTList(DstVT, MVT::Other); | |||
| 21803 | ||||
| 21804 | SDValue FILDOps[] = {Chain, Pointer}; | |||
| 21805 | SDValue Result = | |||
| 21806 | DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo, | |||
| 21807 | Alignment, MachineMemOperand::MOLoad); | |||
| 21808 | Chain = Result.getValue(1); | |||
| 21809 | ||||
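// If the destination lives in an SSE register, the x87 FILD result (an f80
// value) cannot be moved there directly; it is spilled to a stack slot at
// DstVT precision and reloaded below.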
| 21810 | if (useSSE) { | |||
| 21811 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 21812 | unsigned SSFISize = DstVT.getStoreSize(); | |||
| 21813 | int SSFI = | |||
| 21814 | MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false); | |||
| 21815 | auto PtrVT = getPointerTy(MF.getDataLayout()); | |||
| 21816 | SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); | |||
| 21817 | Tys = DAG.getVTList(MVT::Other); | |||
| 21818 | SDValue FSTOps[] = {Chain, Result, StackSlot}; | |||
| 21819 | MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand( | |||
| 21820 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI), | |||
| 21821 | MachineMemOperand::MOStore, SSFISize, Align(SSFISize)); | |||
| 21822 | ||||
| 21823 | Chain = | |||
| 21824 | DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO); | |||
| 21825 | Result = DAG.getLoad( | |||
| 21826 | DstVT, DL, Chain, StackSlot, | |||
| 21827 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI)); | |||
| 21828 | Chain = Result.getValue(1); | |||
| 21829 | } | |||
| 21830 | ||||
| 21831 | return { Result, Chain }; | |||
| 21832 | } | |||
| 21833 | ||||
| 21834 | /// Horizontal vector math instructions may be slower than normal math with | |||
| 21835 | /// shuffles. Limit horizontal op codegen based on size/speed trade-offs, uarch | |||
| 21836 | /// implementation, and likely shuffle complexity of the alternate sequence. | |||
| 21837 | static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG, | |||
| 21838 | const X86Subtarget &Subtarget) { | |||
| 21839 | bool IsOptimizingSize = DAG.shouldOptForSize(); | |||
| 21840 | bool HasFastHOps = Subtarget.hasFastHorizontalOps(); | |||
| 21841 | return !IsSingleSource || IsOptimizingSize || HasFastHOps; | |||
| 21842 | } | |||
| 21843 | ||||
| 21844 | /// 64-bit unsigned integer to double expansion. | |||
| 21845 | static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG, | |||
| 21846 | const X86Subtarget &Subtarget) { | |||
| 21847 | // We can't use this algorithm for strict fp. It produces -0.0 instead of
| 21848 | // +0.0 when converting 0 while rounding toward negative infinity. The caller
| 21849 | // will fall back to Expand when i64 is legal, or will use FILD in 32-bit mode.
| 21850 | assert(!Op->isStrictFPOpcode() && "Expected non-strict uint_to_fp!");
| 21851 | // This algorithm is not obvious. Here is what we're trying to output:
| 21852 | /* | |||
| 21853 | movq %rax, %xmm0 | |||
| 21854 | punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U } | |||
| 21855 | subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 } | |||
| 21856 | #ifdef __SSE3__ | |||
| 21857 | haddpd %xmm0, %xmm0 | |||
| 21858 | #else | |||
| 21859 | pshufd $0x4e, %xmm0, %xmm1 | |||
| 21860 | addpd %xmm1, %xmm0 | |||
| 21861 | #endif | |||
| 21862 | */ | |||
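// Roughly why the magic constants work (Lo/Hi are the low/high 32-bit halves
// of the input x):
//   punpckldq pairs each half with a magic high dword, producing two doubles
//     D0 = 2^52 + Lo        (0x43300000 is the high word of 2^52)
//     D1 = 2^84 + Hi * 2^32 (0x45300000 is the high word of 2^84)
//   subpd cancels the biases exactly:
//     D0 - 2^52 = Lo  and  D1 - 2^84 = Hi * 2^32
//   and the horizontal add yields Lo + Hi * 2^32 = (double)x, the only step
//   where rounding can occur.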
| 21863 | ||||
| 21864 | SDLoc dl(Op); | |||
| 21865 | LLVMContext *Context = DAG.getContext(); | |||
| 21866 | ||||
| 21867 | // Build some magic constants. | |||
| 21868 | static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 }; | |||
| 21869 | Constant *C0 = ConstantDataVector::get(*Context, CV0); | |||
| 21870 | auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); | |||
| 21871 | SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16)); | |||
| 21872 | ||||
| 21873 | SmallVector<Constant*,2> CV1; | |||
| 21874 | CV1.push_back( | |||
| 21875 | ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), | |||
| 21876 | APInt(64, 0x4330000000000000ULL)))); | |||
| 21877 | CV1.push_back( | |||
| 21878 | ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(), | |||
| 21879 | APInt(64, 0x4530000000000000ULL)))); | |||
| 21880 | Constant *C1 = ConstantVector::get(CV1); | |||
| 21881 | SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16)); | |||
| 21882 | ||||
| 21883 | // Load the 64-bit value into an XMM register. | |||
| 21884 | SDValue XR1 = | |||
| 21885 | DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(0)); | |||
| 21886 | SDValue CLod0 = DAG.getLoad( | |||
| 21887 | MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0, | |||
| 21888 | MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16)); | |||
| 21889 | SDValue Unpck1 = | |||
| 21890 | getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0); | |||
| 21891 | ||||
| 21892 | SDValue CLod1 = DAG.getLoad( | |||
| 21893 | MVT::v2f64, dl, CLod0.getValue(1), CPIdx1, | |||
| 21894 | MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(16)); | |||
| 21895 | SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1); | |||
| 21896 | // TODO: Are there any fast-math-flags to propagate here? | |||
| 21897 | SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1); | |||
| 21898 | SDValue Result; | |||
| 21899 | ||||
| 21900 | if (Subtarget.hasSSE3() && | |||
| 21901 | shouldUseHorizontalOp(true, DAG, Subtarget)) { | |||
| 21902 | Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub); | |||
| 21903 | } else { | |||
| 21904 | SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1}); | |||
| 21905 | Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub); | |||
| 21906 | } | |||
| 21907 | Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result, | |||
| 21908 | DAG.getIntPtrConstant(0, dl)); | |||
| 21909 | return Result; | |||
| 21910 | } | |||
| 21911 | ||||
| 21912 | /// 32-bit unsigned integer to float expansion. | |||
| 21913 | static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG, | |||
| 21914 | const X86Subtarget &Subtarget) { | |||
| 21915 | unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; | |||
| 21916 | SDLoc dl(Op); | |||
| 21917 | // FP constant to bias correct the final result. | |||
| 21918 | SDValue Bias = DAG.getConstantFP( | |||
| 21919 | llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::f64); | |||
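// The trick, roughly: 0x4330000000000000 is the bit pattern of 2^52, and
// OR'ing a 32-bit x into the low mantissa bits yields exactly the double
// 2^52 + x, so subtracting the bias recovers x with no rounding. A minimal
// standalone sketch (assuming C++20 std::bit_cast):
//   double u32_to_double(uint32_t x) {
//     return std::bit_cast<double>(0x4330000000000000ULL | x) - 0x1.0p52;
//   }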
| 21920 | ||||
| 21921 | // Load the 32-bit value into an XMM register. | |||
| 21922 | SDValue Load = | |||
| 21923 | DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo)); | |||
| 21924 | ||||
| 21925 | // Zero out the upper parts of the register. | |||
| 21926 | Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG); | |||
| 21927 | ||||
| 21928 | // Or the load with the bias. | |||
| 21929 | SDValue Or = DAG.getNode( | |||
| 21930 | ISD::OR, dl, MVT::v2i64, | |||
| 21931 | DAG.getBitcast(MVT::v2i64, Load), | |||
| 21932 | DAG.getBitcast(MVT::v2i64, | |||
| 21933 | DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias))); | |||
| 21934 | Or = | |||
| 21935 | DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, | |||
| 21936 | DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl)); | |||
| 21937 | ||||
| 21938 | if (Op.getNode()->isStrictFPOpcode()) { | |||
| 21939 | // Subtract the bias. | |||
| 21940 | // TODO: Are there any fast-math-flags to propagate here? | |||
| 21941 | SDValue Chain = Op.getOperand(0); | |||
| 21942 | SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other}, | |||
| 21943 | {Chain, Or, Bias}); | |||
| 21944 | ||||
| 21945 | if (Op.getValueType() == Sub.getValueType()) | |||
| 21946 | return Sub; | |||
| 21947 | ||||
| 21948 | // Handle final rounding. | |||
| 21949 | std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound( | |||
| 21950 | Sub, Sub.getValue(1), dl, Op.getSimpleValueType()); | |||
| 21951 | ||||
| 21952 | return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl); | |||
| 21953 | } | |||
| 21954 | ||||
| 21955 | // Subtract the bias. | |||
| 21956 | // TODO: Are there any fast-math-flags to propagate here? | |||
| 21957 | SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias); | |||
| 21958 | ||||
| 21959 | // Handle final rounding. | |||
| 21960 | return DAG.getFPExtendOrRound(Sub, dl, Op.getSimpleValueType()); | |||
| 21961 | } | |||
| 21962 | ||||
| 21963 | static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG, | |||
| 21964 | const X86Subtarget &Subtarget, | |||
| 21965 | const SDLoc &DL) { | |||
| 21966 | if (Op.getSimpleValueType() != MVT::v2f64) | |||
| 21967 | return SDValue(); | |||
| 21968 | ||||
| 21969 | bool IsStrict = Op->isStrictFPOpcode(); | |||
| 21970 | ||||
| 21971 | SDValue N0 = Op.getOperand(IsStrict ? 1 : 0); | |||
| 21972 | assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
| 21973 | ||||
| 21974 | if (Subtarget.hasAVX512()) { | |||
| 21975 | if (!Subtarget.hasVLX()) { | |||
| 21976 | // Let generic type legalization widen this. | |||
| 21977 | if (!IsStrict) | |||
| 21978 | return SDValue(); | |||
| 21979 | // Otherwise pad the integer input with 0s and widen the operation. | |||
| 21980 | N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, | |||
| 21981 | DAG.getConstant(0, DL, MVT::v2i32)); | |||
| 21982 | SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other}, | |||
| 21983 | {Op.getOperand(0), N0}); | |||
| 21984 | SDValue Chain = Res.getValue(1); | |||
| 21985 | Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res, | |||
| 21986 | DAG.getIntPtrConstant(0, DL)); | |||
| 21987 | return DAG.getMergeValues({Res, Chain}, DL); | |||
| 21988 | } | |||
| 21989 | ||||
| 21990 | // Legalize to v4i32 type. | |||
| 21991 | N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, | |||
| 21992 | DAG.getUNDEF(MVT::v2i32)); | |||
| 21993 | if (IsStrict) | |||
| 21994 | return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other}, | |||
| 21995 | {Op.getOperand(0), N0}); | |||
| 21996 | return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0); | |||
| 21997 | } | |||
| 21998 | ||||
| 21999 | // Zero extend to 2i64, OR with the floating point representation of 2^52. | |||
| 22000 | // This gives us the floating point equivalent of 2^52 + the i32 integer | |||
| 22001 | // since double has 52-bits of mantissa. Then subtract 2^52 in floating | |||
| 22002 | // point leaving just our i32 integers in double format. | |||
| 22003 | SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0); | |||
| 22004 | SDValue VBias = DAG.getConstantFP( | |||
| 22005 | llvm::bit_cast<double>(0x4330000000000000ULL), DL, MVT::v2f64); | |||
| 22006 | SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn, | |||
| 22007 | DAG.getBitcast(MVT::v2i64, VBias)); | |||
| 22008 | Or = DAG.getBitcast(MVT::v2f64, Or); | |||
| 22009 | ||||
| 22010 | if (IsStrict) | |||
| 22011 | return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other}, | |||
| 22012 | {Op.getOperand(0), Or, VBias}); | |||
| 22013 | return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias); | |||
| 22014 | } | |||
| 22015 | ||||
| 22016 | static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG, | |||
| 22017 | const X86Subtarget &Subtarget) { | |||
| 22018 | SDLoc DL(Op); | |||
| 22019 | bool IsStrict = Op->isStrictFPOpcode(); | |||
| 22020 | SDValue V = Op->getOperand(IsStrict ? 1 : 0); | |||
| 22021 | MVT VecIntVT = V.getSimpleValueType(); | |||
| 22022 | assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
| 22023 |        "Unsupported custom type");
| 22024 | ||||
| 22025 | if (Subtarget.hasAVX512()) { | |||
| 22026 | // With AVX512, but not VLX we need to widen to get a 512-bit result type. | |||
| 22027 | assert(!Subtarget.hasVLX() && "Unexpected features");
| 22028 | MVT VT = Op->getSimpleValueType(0); | |||
| 22029 | ||||
| 22030 | // v8i32->v8f64 is legal with AVX512 so just return it. | |||
| 22031 | if (VT == MVT::v8f64) | |||
| 22032 | return Op; | |||
| 22033 | ||||
| 22034 | assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
| 22035 |        "Unexpected VT!");
| 22036 | MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32; | |||
| 22037 | MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32; | |||
| 22038 | // Need to concat with zero vector for strict fp to avoid spurious | |||
| 22039 | // exceptions. | |||
| 22040 | SDValue Tmp = | |||
| 22041 | IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT); | |||
| 22042 | V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V, | |||
| 22043 | DAG.getIntPtrConstant(0, DL)); | |||
| 22044 | SDValue Res, Chain; | |||
| 22045 | if (IsStrict) { | |||
| 22046 | Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other}, | |||
| 22047 | {Op->getOperand(0), V}); | |||
| 22048 | Chain = Res.getValue(1); | |||
| 22049 | } else { | |||
| 22050 | Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V); | |||
| 22051 | } | |||
| 22052 | ||||
| 22053 | Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, | |||
| 22054 | DAG.getIntPtrConstant(0, DL)); | |||
| 22055 | ||||
| 22056 | if (IsStrict) | |||
| 22057 | return DAG.getMergeValues({Res, Chain}, DL); | |||
| 22058 | return Res; | |||
| 22059 | } | |||
| 22060 | ||||
| 22061 | if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 && | |||
| 22062 | Op->getSimpleValueType(0) == MVT::v4f64) { | |||
| 22063 | SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V); | |||
| 22064 | Constant *Bias = ConstantFP::get( | |||
| 22065 | *DAG.getContext(), | |||
| 22066 | APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL))); | |||
| 22067 | auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); | |||
| 22068 | SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8)); | |||
| 22069 | SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other); | |||
| 22070 | SDValue Ops[] = {DAG.getEntryNode(), CPIdx}; | |||
| 22071 | SDValue VBias = DAG.getMemIntrinsicNode( | |||
| 22072 | X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64, | |||
| 22073 | MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8), | |||
| 22074 | MachineMemOperand::MOLoad); | |||
| 22075 | ||||
| 22076 | SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn, | |||
| 22077 | DAG.getBitcast(MVT::v4i64, VBias)); | |||
| 22078 | Or = DAG.getBitcast(MVT::v4f64, Or); | |||
| 22079 | ||||
| 22080 | if (IsStrict) | |||
| 22081 | return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other}, | |||
| 22082 | {Op.getOperand(0), Or, VBias}); | |||
| 22083 | return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias); | |||
| 22084 | } | |||
| 22085 | ||||
| 22086 | // The algorithm is the following: | |||
| 22087 | // #ifdef __SSE4_1__ | |||
| 22088 | // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); | |||
| 22089 | // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), | |||
| 22090 | // (uint4) 0x53000000, 0xaa); | |||
| 22091 | // #else | |||
| 22092 | // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; | |||
| 22093 | // uint4 hi = (v >> 16) | (uint4) 0x53000000; | |||
| 22094 | // #endif | |||
| 22095 | // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); | |||
| 22096 | // return (float4) lo + fhi; | |||
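// A worked example of the arithmetic above for v = 0x89ABCDEF:
//   lo  = 0x4B00CDEF, i.e. the float 2^23 + 0xCDEF (exact).
//   hi  = 0x530089AB, i.e. the float 2^39 + 0x89AB * 2^16 (exact).
//   fhi = hi - (0x1.0p39f + 0x1.0p23f) = 0x89AB * 2^16 - 2^23 (exact, since
//         both operands share the 2^39 exponent).
//   lo + fhi = 0xCDEF + 0x89AB * 2^16 = v; only this final add can round.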
| 22097 | ||||
| 22098 | bool Is128 = VecIntVT == MVT::v4i32; | |||
| 22099 | MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32; | |||
| 22100 | // If we convert to something other than the supported type, e.g., to v4f64,
| 22101 | // abort early.
| 22102 | if (VecFloatVT != Op->getSimpleValueType(0)) | |||
| 22103 | return SDValue(); | |||
| 22104 | ||||
| 22105 | // In the #ifdef/#else code, we have in common:
| 22106 | // - The vector of constants: | |||
| 22107 | // -- 0x4b000000 | |||
| 22108 | // -- 0x53000000 | |||
| 22109 | // - A shift: | |||
| 22110 | // -- v >> 16 | |||
| 22111 | ||||
| 22112 | // Create the splat vector for 0x4b000000. | |||
| 22113 | SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT); | |||
| 22114 | // Create the splat vector for 0x53000000. | |||
| 22115 | SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT); | |||
| 22116 | ||||
| 22117 | // Create the right shift. | |||
| 22118 | SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT); | |||
| 22119 | SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift); | |||
| 22120 | ||||
| 22121 | SDValue Low, High; | |||
| 22122 | if (Subtarget.hasSSE41()) { | |||
| 22123 | MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16; | |||
| 22124 | // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa); | |||
| 22125 | SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow); | |||
| 22126 | SDValue VecBitcast = DAG.getBitcast(VecI16VT, V); | |||
| 22127 | // Low will be bitcasted right away, so do not bother bitcasting back to its | |||
| 22128 | // original type. | |||
| 22129 | Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast, | |||
| 22130 | VecCstLowBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8)); | |||
| 22131 | // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16), | |||
| 22132 | // (uint4) 0x53000000, 0xaa); | |||
| 22133 | SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh); | |||
| 22134 | SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift); | |||
| 22135 | // High will be bitcasted right away, so do not bother bitcasting back to | |||
| 22136 | // its original type. | |||
| 22137 | High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast, | |||
| 22138 | VecCstHighBitcast, DAG.getTargetConstant(0xaa, DL, MVT::i8)); | |||
| 22139 | } else { | |||
| 22140 | SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT); | |||
| 22141 | // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000; | |||
| 22142 | SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask); | |||
| 22143 | Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow); | |||
| 22144 | ||||
| 22145 | // uint4 hi = (v >> 16) | (uint4) 0x53000000; | |||
| 22146 | High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh); | |||
| 22147 | } | |||
| 22148 | ||||
| 22149 | // Create the vector constant for (0x1.0p39f + 0x1.0p23f). | |||
| 22150 | SDValue VecCstFSub = DAG.getConstantFP( | |||
| 22151 | APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT); | |||
| 22152 | ||||
| 22153 | // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f); | |||
| 22154 | // NOTE: By using fsub of a positive constant instead of fadd of a negative | |||
| 22155 | // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is | |||
| 22156 | // enabled. See PR24512. | |||
| 22157 | SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High); | |||
| 22158 | // TODO: Are there any fast-math-flags to propagate here? | |||
| 22159 | // (float4) lo; | |||
| 22160 | SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low); | |||
| 22161 | // return (float4) lo + fhi; | |||
| 22162 | if (IsStrict) { | |||
| 22163 | SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other}, | |||
| 22164 | {Op.getOperand(0), HighBitcast, VecCstFSub}); | |||
| 22165 | return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other}, | |||
| 22166 | {FHigh.getValue(1), LowBitcast, FHigh}); | |||
| 22167 | } | |||
| 22168 | ||||
| 22169 | SDValue FHigh = | |||
| 22170 | DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub); | |||
| 22171 | return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh); | |||
| 22172 | } | |||
| 22173 | ||||
| 22174 | static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG, | |||
| 22175 | const X86Subtarget &Subtarget) { | |||
| 22176 | unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0; | |||
| 22177 | SDValue N0 = Op.getOperand(OpNo); | |||
| 22178 | MVT SrcVT = N0.getSimpleValueType(); | |||
| 22179 | SDLoc dl(Op); | |||
| 22180 | ||||
| 22181 | switch (SrcVT.SimpleTy) { | |||
| 22182 | default: | |||
| 22183 | llvm_unreachable("Custom UINT_TO_FP is not supported!")::llvm::llvm_unreachable_internal("Custom UINT_TO_FP is not supported!" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 22183); | |||
| 22184 | case MVT::v2i32: | |||
| 22185 | return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl); | |||
| 22186 | case MVT::v4i32: | |||
| 22187 | case MVT::v8i32: | |||
| 22188 | return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget); | |||
| 22189 | case MVT::v2i64: | |||
| 22190 | case MVT::v4i64: | |||
| 22191 | return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget); | |||
| 22192 | } | |||
| 22193 | } | |||
| 22194 | ||||
| 22195 | SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, | |||
| 22196 | SelectionDAG &DAG) const { | |||
| 22197 | bool IsStrict = Op->isStrictFPOpcode(); | |||
| 22198 | unsigned OpNo = IsStrict ? 1 : 0; | |||
| 22199 | SDValue Src = Op.getOperand(OpNo); | |||
| 22200 | SDLoc dl(Op); | |||
| 22201 | auto PtrVT = getPointerTy(DAG.getDataLayout()); | |||
| 22202 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 22203 | MVT DstVT = Op->getSimpleValueType(0); | |||
| 22204 | SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); | |||
| 22205 | ||||
| 22206 | // Bail out when we don't have native conversion instructions. | |||
| 22207 | if (DstVT == MVT::f128) | |||
| 22208 | return SDValue(); | |||
| 22209 | ||||
| 22210 | if (isSoftFP16(DstVT)) | |||
| 22211 | return promoteXINT_TO_FP(Op, DAG); | |||
| 22212 | else if (isLegalConversion(SrcVT, false, Subtarget)) | |||
| 22213 | return Op; | |||
| 22214 | ||||
| 22215 | if (DstVT.isVector()) | |||
| 22216 | return lowerUINT_TO_FP_vec(Op, DAG, Subtarget); | |||
| 22217 | ||||
| 22218 | if (Subtarget.isTargetWin64() && SrcVT == MVT::i128) | |||
| 22219 | return LowerWin64_INT128_TO_FP(Op, DAG); | |||
| 22220 | ||||
| 22221 | if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget)) | |||
| 22222 | return Extract; | |||
| 22223 | ||||
| 22224 | if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) && | |||
| 22225 | (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) { | |||
| 22226 | // Conversions from unsigned i32 to f32/f64 are legal, | |||
| 22227 | // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode. | |||
| 22228 | return Op; | |||
| 22229 | } | |||
| 22230 | ||||
| 22231 | // Promote i32 to i64 and use a signed conversion on 64-bit targets; the zero-extended value is non-negative, so the signed convert is exact.
| 22232 | if (SrcVT == MVT::i32 && Subtarget.is64Bit()) { | |||
| 22233 | Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src); | |||
| 22234 | if (IsStrict) | |||
| 22235 | return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other}, | |||
| 22236 | {Chain, Src}); | |||
| 22237 | return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src); | |||
| 22238 | } | |||
| 22239 | ||||
| 22240 | if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget)) | |||
| 22241 | return V; | |||
| 22242 | if (SDValue V = LowerI64IntToFP16(Op, DAG, Subtarget)) | |||
| 22243 | return V; | |||
| 22244 | ||||
| 22245 | // The transform for i64->f64 isn't correct for 0 when rounding to negative | |||
| 22246 | // infinity. It produces -0.0, so disable under strictfp. | |||
| 22247 | if (SrcVT == MVT::i64 && DstVT == MVT::f64 && Subtarget.hasSSE2() && | |||
| 22248 | !IsStrict) | |||
| 22249 | return LowerUINT_TO_FP_i64(Op, DAG, Subtarget); | |||
| 22250 | // The transform for i32->f64/f32 isn't correct for 0 when rounding to
| 22251 | // negative infinity, so disable it under strictfp and use FILD instead.
| 22252 | if (SrcVT == MVT::i32 && Subtarget.hasSSE2() && DstVT != MVT::f80 && | |||
| 22253 | !IsStrict) | |||
| 22254 | return LowerUINT_TO_FP_i32(Op, DAG, Subtarget); | |||
| 22255 | if (Subtarget.is64Bit() && SrcVT == MVT::i64 && | |||
| 22256 | (DstVT == MVT::f32 || DstVT == MVT::f64)) | |||
| 22257 | return SDValue(); | |||
| 22258 | ||||
| 22259 | // Make a 64-bit buffer, and use it to build an FILD. | |||
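// For an i32 source, the stores below zero-extend through memory: the value
// goes in the low 4 bytes and an explicit 0 in the high 4, so the 64-bit FILD
// sees a small non-negative integer and converts it exactly.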
| 22260 | SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8); | |||
| 22261 | int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex(); | |||
| 22262 | Align SlotAlign(8); | |||
| 22263 | MachinePointerInfo MPI = | |||
| 22264 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI); | |||
| 22265 | if (SrcVT == MVT::i32) { | |||
| 22266 | SDValue OffsetSlot = | |||
| 22267 | DAG.getMemBasePlusOffset(StackSlot, TypeSize::Fixed(4), dl); | |||
| 22268 | SDValue Store1 = DAG.getStore(Chain, dl, Src, StackSlot, MPI, SlotAlign); | |||
| 22269 | SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32), | |||
| 22270 | OffsetSlot, MPI.getWithOffset(4), SlotAlign); | |||
| 22271 | std::pair<SDValue, SDValue> Tmp = | |||
| 22272 | BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, SlotAlign, DAG); | |||
| 22273 | if (IsStrict) | |||
| 22274 | return DAG.getMergeValues({Tmp.first, Tmp.second}, dl); | |||
| 22275 | ||||
| 22276 | return Tmp.first; | |||
| 22277 | } | |||
| 22278 | ||||
| 22279 | assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
| 22280 | SDValue ValueToStore = Src; | |||
| 22281 | if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) { | |||
| 22282 | // Bitcasting to f64 here allows us to do a single 64-bit store from | |||
| 22283 | // an SSE register, avoiding the store forwarding penalty that would come | |||
| 22284 | // with two 32-bit stores. | |||
| 22285 | ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore); | |||
| 22286 | } | |||
| 22287 | SDValue Store = | |||
| 22288 | DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, SlotAlign); | |||
| 22289 | // For i64 source, we need to add the appropriate power of 2 if the input | |||
| 22290 | // was negative. We must be careful to do the computation in x87 extended | |||
| 22291 | // precision, not in SSE. | |||
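// Sketch of the fixup below: FILD reads the stored u64 as *signed*, so an
// input x with the sign bit set is loaded as x - 2^64. The "fudge" constant
// is 2^64 encoded as an f32 (0x5F800000); adding it back at f80 precision
// when the sign bit was set yields x exactly.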
| 22292 | SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); | |||
| 22293 | SDValue Ops[] = { Store, StackSlot }; | |||
| 22294 | SDValue Fild = | |||
| 22295 | DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI, | |||
| 22296 | SlotAlign, MachineMemOperand::MOLoad); | |||
| 22297 | Chain = Fild.getValue(1); | |||
| 22298 |
| 22300 | // Check whether the sign bit is set. | |||
| 22301 | SDValue SignSet = DAG.getSetCC( | |||
| 22302 | dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64), | |||
| 22303 | Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT); | |||
| 22304 | ||||
| 22305 | // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits. | |||
| 22306 | APInt FF(64, 0x5F80000000000000ULL); | |||
| 22307 | SDValue FudgePtr = DAG.getConstantPool( | |||
| 22308 | ConstantInt::get(*DAG.getContext(), FF), PtrVT); | |||
| 22309 | Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign(); | |||
| 22310 | ||||
| 22311 | // Get a pointer to FF if the sign bit was set, or to 0 otherwise. | |||
| 22312 | SDValue Zero = DAG.getIntPtrConstant(0, dl); | |||
| 22313 | SDValue Four = DAG.getIntPtrConstant(4, dl); | |||
| 22314 | SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero); | |||
| 22315 | FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset); | |||
| 22316 | ||||
| 22317 | // Load the value out, extending it from f32 to f80. | |||
| 22318 | SDValue Fudge = DAG.getExtLoad( | |||
| 22319 | ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr, | |||
| 22320 | MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32, | |||
| 22321 | CPAlignment); | |||
| 22322 | Chain = Fudge.getValue(1); | |||
| 22323 | // Extend everything to 80 bits to force it to be done on x87. | |||
| 22324 | // TODO: Are there any fast-math-flags to propagate here? | |||
| 22325 | if (IsStrict) { | |||
| 22326 | unsigned Opc = ISD::STRICT_FADD; | |||
| 22327 | // Windows needs the precision control changed to 80 bits around this add.
| 22328 | if (Subtarget.isOSWindows() && DstVT == MVT::f32) | |||
| 22329 | Opc = X86ISD::STRICT_FP80_ADD; | |||
| 22330 | ||||
| 22331 | SDValue Add = | |||
| 22332 | DAG.getNode(Opc, dl, {MVT::f80, MVT::Other}, {Chain, Fild, Fudge}); | |||
| 22333 | // STRICT_FP_ROUND can't handle equal types. | |||
| 22334 | if (DstVT == MVT::f80) | |||
| 22335 | return Add; | |||
| 22336 | return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other}, | |||
| 22337 | {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)}); | |||
| 22338 | } | |||
| 22339 | unsigned Opc = ISD::FADD; | |||
| 22340 | // Windows needs the precision control changed to 80 bits around this add.
| 22341 | if (Subtarget.isOSWindows() && DstVT == MVT::f32) | |||
| 22342 | Opc = X86ISD::FP80_ADD; | |||
| 22343 | ||||
| 22344 | SDValue Add = DAG.getNode(Opc, dl, MVT::f80, Fild, Fudge); | |||
| 22345 | return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, | |||
| 22346 | DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)); | |||
| 22347 | } | |||
| 22348 | ||||
| 22349 | // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation | |||
| 22350 | // is legal, or has an fp128 or f16 source (which needs to be promoted to f32), | |||
| 22351 | // just return an SDValue(). | |||
| 22352 | // Otherwise it is assumed to be a conversion from one of f32, f64 or f80 | |||
| 22353 | // to i16, i32 or i64, and we lower it to a legal sequence and return the | |||
| 22354 | // result. | |||
| 22355 | SDValue | |||
| 22356 | X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, | |||
| 22357 | bool IsSigned, SDValue &Chain) const { | |||
| 22358 | bool IsStrict = Op->isStrictFPOpcode(); | |||
| 22359 | SDLoc DL(Op); | |||
| 22360 | ||||
| 22361 | EVT DstTy = Op.getValueType(); | |||
| 22362 | SDValue Value = Op.getOperand(IsStrict ? 1 : 0); | |||
| 22363 | EVT TheVT = Value.getValueType(); | |||
| 22364 | auto PtrVT = getPointerTy(DAG.getDataLayout()); | |||
| 22365 | ||||
| 22366 | if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) { | |||
| 22367 | // f16 must be promoted before using the lowering in this routine. | |||
| 22368 | // fp128 does not use this lowering. | |||
| 22369 | return SDValue(); | |||
| 22370 | } | |||
| 22371 | ||||
| 22372 | // If using FIST to compute an unsigned i64, we'll need some fixup | |||
| 22373 | // to handle values above the maximum signed i64. A FIST is always | |||
| 22374 | // used for the 32-bit subtarget, but also for f80 on a 64-bit target. | |||
| 22375 | bool UnsignedFixup = !IsSigned && DstTy == MVT::i64; | |||
| 22376 | ||||
| 22377 | // FIXME: This does not generate an invalid exception if the input does not | |||
| 22378 | // fit in i32. PR44019 | |||
| 22379 | if (!IsSigned && DstTy != MVT::i64) { | |||
| 22380 | // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST. | |||
| 22381 | // The low 32 bits of the fist result will have the correct uint32 result. | |||
| 22382 | assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
| 22383 | DstTy = MVT::i64; | |||
| 22384 | } | |||
| 22385 | ||||
| 22386 | assert(DstTy.getSimpleVT() <= MVT::i64 &&
| 22387 |        DstTy.getSimpleVT() >= MVT::i16 &&
| 22388 |        "Unknown FP_TO_INT to lower!");
| 22389 | ||||
| 22390 | // We lower FP->int64 into FISTP64 followed by a load from a temporary | |||
| 22391 | // stack slot. | |||
| 22392 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 22393 | unsigned MemSize = DstTy.getStoreSize(); | |||
| 22394 | int SSFI = | |||
| 22395 | MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false); | |||
| 22396 | SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT); | |||
| 22397 | ||||
| 22398 | Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); | |||
| 22399 | ||||
| 22400 | SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment. | |||
| 22401 | ||||
| 22402 | if (UnsignedFixup) { | |||
| 22403 | // | |||
| 22404 | // Conversion to unsigned i64 is implemented with a select, | |||
| 22405 | // depending on whether the source value fits in the range | |||
| 22406 | // of a signed i64. Let Thresh be the FP equivalent of | |||
| 22407 | // 0x8000000000000000ULL. | |||
| 22408 | // | |||
| 22409 | // Adjust = (Value >= Thresh) ? 0x80000000 : 0;
| 22410 | // FltOfs = (Value >= Thresh) ? Thresh : 0;
| 22411 | // FistSrc = (Value - FltOfs); | |||
| 22412 | // Fist-to-mem64 FistSrc | |||
| 22413 | // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent | |||
| 22414 | // to XOR'ing the high 32 bits with Adjust. | |||
| 22415 | // | |||
| 22416 | // Being a power of 2, Thresh is exactly representable in all FP formats. | |||
| 22417 | // For X87 we'd like to use the smallest FP type for this constant, but | |||
| 22418 | // for DAG type consistency we have to match the FP operand type. | |||
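// A sketch with an input just above the signed range, say Value = 2^63 + 2^40
// (exactly representable in f64):
//   Cmp = (Value >= Thresh) is true, so FltOfs = 2^63 and Adjust = 1ULL << 63.
//   FistSrc = Value - 2^63 = 2^40 now fits in a signed i64, the FIST stores
//   2^40, and XOR'ing the loaded result with Adjust restores 2^63 + 2^40.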
| 22419 | ||||
| 22420 | APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000)); | |||
| 22421 | LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
| 22422 | bool LosesInfo = false; | |||
| 22423 | if (TheVT == MVT::f64) | |||
| 22424 | // The rounding mode is irrelevant as the conversion should be exact. | |||
| 22425 | Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven, | |||
| 22426 | &LosesInfo); | |||
| 22427 | else if (TheVT == MVT::f80) | |||
| 22428 | Status = Thresh.convert(APFloat::x87DoubleExtended(), | |||
| 22429 | APFloat::rmNearestTiesToEven, &LosesInfo); | |||
| 22430 | ||||
| 22431 | assert(Status == APFloat::opOK && !LosesInfo &&
| 22432 |        "FP conversion should have been exact");
| 22433 | ||||
| 22434 | SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT); | |||
| 22435 | ||||
| 22436 | EVT ResVT = getSetCCResultType(DAG.getDataLayout(), | |||
| 22437 | *DAG.getContext(), TheVT); | |||
| 22438 | SDValue Cmp; | |||
| 22439 | if (IsStrict) { | |||
| 22440 | Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE, Chain, | |||
| 22441 | /*IsSignaling*/ true); | |||
| 22442 | Chain = Cmp.getValue(1); | |||
| 22443 | } else { | |||
| 22444 | Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETGE); | |||
| 22445 | } | |||
| 22446 | ||||
| 22447 | // Our preferred lowering of | |||
| 22448 | // | |||
| 22449 | // (Value >= Thresh) ? 0x8000000000000000ULL : 0 | |||
| 22450 | // | |||
| 22451 | // is | |||
| 22452 | // | |||
| 22453 | // (Value >= Thresh) << 63 | |||
| 22454 | // | |||
| 22455 | // but since we can get here after LegalOperations, DAGCombine might do the | |||
| 22456 | // wrong thing if we create a select. So, directly create the preferred | |||
| 22457 | // version. | |||
| 22458 | SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Cmp); | |||
| 22459 | SDValue Const63 = DAG.getConstant(63, DL, MVT::i8); | |||
| 22460 | Adjust = DAG.getNode(ISD::SHL, DL, MVT::i64, Zext, Const63); | |||
| 22461 | ||||
| 22462 | SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp, ThreshVal, | |||
| 22463 | DAG.getConstantFP(0.0, DL, TheVT)); | |||
| 22464 | ||||
| 22465 | if (IsStrict) { | |||
| 22466 | Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other}, | |||
| 22467 | { Chain, Value, FltOfs }); | |||
| 22468 | Chain = Value.getValue(1); | |||
| 22469 | } else | |||
| 22470 | Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs); | |||
| 22471 | } | |||
| 22472 | ||||
| 22473 | MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI); | |||
| 22474 | ||||
| 22475 | // FIXME This causes a redundant load/store if the SSE-class value is already | |||
| 22476 | // in memory, such as if it is on the callstack. | |||
| 22477 | if (isScalarFPTypeInSSEReg(TheVT)) { | |||
| 22478 | assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
| 22479 | Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI); | |||
| 22480 | SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); | |||
| 22481 | SDValue Ops[] = { Chain, StackSlot }; | |||
| 22482 | ||||
| 22483 | unsigned FLDSize = TheVT.getStoreSize(); | |||
| 22484 | assert(FLDSize <= MemSize && "Stack slot not big enough");
| 22485 | MachineMemOperand *MMO = MF.getMachineMemOperand( | |||
| 22486 | MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize)); | |||
| 22487 | Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO); | |||
| 22488 | Chain = Value.getValue(1); | |||
| 22489 | } | |||
| 22490 | ||||
| 22491 | // Build the FP_TO_INT*_IN_MEM | |||
| 22492 | MachineMemOperand *MMO = MF.getMachineMemOperand( | |||
| 22493 | MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize)); | |||
| 22494 | SDValue Ops[] = { Chain, Value, StackSlot }; | |||
| 22495 | SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL, | |||
| 22496 | DAG.getVTList(MVT::Other), | |||
| 22497 | Ops, DstTy, MMO); | |||
| 22498 | ||||
| 22499 | SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI); | |||
| 22500 | Chain = Res.getValue(1); | |||
| 22501 | ||||
| 22502 | // If we need an unsigned fixup, XOR the result with adjust. | |||
| 22503 | if (UnsignedFixup) | |||
| 22504 | Res = DAG.getNode(ISD::XOR, DL, MVT::i64, Res, Adjust); | |||
| 22505 | ||||
| 22506 | return Res; | |||
| 22507 | } | |||
| 22508 | ||||
| 22509 | static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, | |||
| 22510 | const X86Subtarget &Subtarget) { | |||
| 22511 | MVT VT = Op.getSimpleValueType(); | |||
| 22512 | SDValue In = Op.getOperand(0); | |||
| 22513 | MVT InVT = In.getSimpleValueType(); | |||
| 22514 | SDLoc dl(Op); | |||
| 22515 | unsigned Opc = Op.getOpcode(); | |||
| 22516 | ||||
| 22517 | assert(VT.isVector() && InVT.isVector() && "Expected vector type");
| 22518 | assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
| 22519 |        "Unexpected extension opcode");
| 22520 | assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
| 22521 |        "Expected same number of elements");
| 22522 | assert((VT.getVectorElementType() == MVT::i16 ||
| 22523 |         VT.getVectorElementType() == MVT::i32 ||
| 22524 |         VT.getVectorElementType() == MVT::i64) &&
| 22525 |        "Unexpected element type");
| 22526 | assert((InVT.getVectorElementType() == MVT::i8 ||
| 22527 |         InVT.getVectorElementType() == MVT::i16 ||
| 22528 |         InVT.getVectorElementType() == MVT::i32) &&
| 22529 |        "Unexpected element type");
| 22530 | ||||
| 22531 | unsigned ExtendInVecOpc = DAG.getOpcode_EXTEND_VECTOR_INREG(Opc); | |||
| 22532 | ||||
| 22533 | if (VT == MVT::v32i16 && !Subtarget.hasBWI()) { | |||
| 22534 | assert(InVT == MVT::v32i8 && "Unexpected VT!");
| 22535 | return splitVectorIntUnary(Op, DAG); | |||
| 22536 | } | |||
| 22537 | ||||
| 22538 | if (Subtarget.hasInt256()) | |||
| 22539 | return Op; | |||
| 22540 | ||||
| 22541 | // Optimize vectors in AVX mode: | |||
| 22542 | // | |||
| 22543 | // v8i16 -> v8i32 | |||
| 22544 | // Use vpmovzwd for 4 lower elements v8i16 -> v4i32. | |||
| 22545 | // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32. | |||
| 22546 | // Concat upper and lower parts. | |||
| 22547 | // | |||
| 22548 | // v4i32 -> v4i64 | |||
| 22549 | // Use vpmovzdq for 4 lower elements v4i32 -> v2i64. | |||
| 22550 | // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64. | |||
| 22551 | // Concat upper and lower parts. | |||
| 22552 | // | |||
| 22553 | MVT HalfVT = VT.getHalfNumVectorElementsVT(); | |||
| 22554 | SDValue OpLo = DAG.getNode(ExtendInVecOpc, dl, HalfVT, In); | |||
| 22555 | ||||
| 22556 | // Short-circuit if we can determine that each 128-bit half is the same value. | |||
| 22557 | // Otherwise, this is difficult to match and optimize. | |||
| 22558 | if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(In)) | |||
| 22559 | if (hasIdenticalHalvesShuffleMask(Shuf->getMask())) | |||
| 22560 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpLo); | |||
| 22561 | ||||
| 22562 | SDValue ZeroVec = DAG.getConstant(0, dl, InVT); | |||
| 22563 | SDValue Undef = DAG.getUNDEF(InVT); | |||
| 22564 | bool NeedZero = Opc == ISD::ZERO_EXTEND; | |||
| 22565 | SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef); | |||
| 22566 | OpHi = DAG.getBitcast(HalfVT, OpHi); | |||
| 22567 | ||||
| 22568 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); | |||
| 22569 | } | |||
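The unpack trick above is easy to sanity-check with a scalar model: interleaving the high words of the source with a zero vector (PUNPCKHWD) yields exactly the zero-extended upper lanes once each {word, zero} pair is re-read as a 32-bit element. A minimal standalone sketch, assuming little-endian lane order; none of these names come from the file above:

```cpp
#include <cassert>
#include <cstdint>

int main() {
  uint16_t src[8] = {1, 2, 3, 4, 0xFFFF, 6, 7, 8};
  uint32_t dst[8];

  // Low half: PMOVZXWD zero-extends elements 0..3 directly.
  for (int i = 0; i < 4; ++i)
    dst[i] = src[i];

  // High half: PUNPCKHWD(src, zero) interleaves words 4..7 with zeros;
  // each {word, 0} pair re-read as an i32 lane is the zero-extended value.
  for (int i = 0; i < 4; ++i) {
    uint16_t lo = src[4 + i]; // low word of the new i32 lane
    uint16_t hi = 0;          // high word supplied by the zero vector
    dst[4 + i] = (uint32_t)lo | ((uint32_t)hi << 16);
  }

  for (int i = 0; i < 8; ++i)
    assert(dst[i] == src[i]); // identical to a per-lane zero_extend
  return 0;
}
```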
| 22570 | ||||
| 22571 | // Helper to split and extend a v16i1 mask to v16i8 or v16i16. | |||
| 22572 | static SDValue SplitAndExtendv16i1(unsigned ExtOpc, MVT VT, SDValue In, | |||
| 22573 | const SDLoc &dl, SelectionDAG &DAG) { | |||
| 22574 | assert((VT == MVT::v16i8 || VT == MVT::v16i16) && "Unexpected VT."); | |||
| 22575 | SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In, | |||
| 22576 | DAG.getIntPtrConstant(0, dl)); | |||
| 22577 | SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i1, In, | |||
| 22578 | DAG.getIntPtrConstant(8, dl)); | |||
| 22579 | Lo = DAG.getNode(ExtOpc, dl, MVT::v8i16, Lo); | |||
| 22580 | Hi = DAG.getNode(ExtOpc, dl, MVT::v8i16, Hi); | |||
| 22581 | SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i16, Lo, Hi); | |||
| 22582 | return DAG.getNode(ISD::TRUNCATE, dl, VT, Res); | |||
| 22583 | } | |||
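Per lane, the helper takes a one-bit mask value through i16, where the extension is legal without BWI, and then truncates to the requested width. A standalone sketch of one lane; `extendLaneViaI16` is an illustrative name, and the ZERO_EXTEND flavor behaves the same way because truncation keeps the low byte in either encoding:

```cpp
#include <cassert>
#include <cstdint>

int8_t extendLaneViaI16(bool b) {
  int16_t wide = b ? -1 : 0; // SIGN_EXTEND of the i1 lane to i16
  return (int8_t)wide;       // TRUNCATE of the concatenated result
}

int main() {
  assert(extendLaneViaI16(true) == -1);
  assert(extendLaneViaI16(false) == 0);
  return 0;
}
```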
| 22584 | ||||
| 22585 | static SDValue LowerZERO_EXTEND_Mask(SDValue Op, | |||
| 22586 | const X86Subtarget &Subtarget, | |||
| 22587 | SelectionDAG &DAG) { | |||
| 22588 | MVT VT = Op->getSimpleValueType(0); | |||
| 22589 | SDValue In = Op->getOperand(0); | |||
| 22590 | MVT InVT = In.getSimpleValueType(); | |||
| 22591 | assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!"); | |||
| 22592 | SDLoc DL(Op); | |||
| 22593 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 22594 | ||||
| 22595 | // For all vectors except vXi8 we can just emit a sign_extend and a shift. | |||
| 22596 | // This avoids a constant pool load. | |||
| 22597 | if (VT.getVectorElementType() != MVT::i8) { | |||
| 22598 | SDValue Extend = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, In); | |||
| 22599 | return DAG.getNode(ISD::SRL, DL, VT, Extend, | |||
| 22600 | DAG.getConstant(VT.getScalarSizeInBits() - 1, DL, VT)); | |||
| 22601 | } | |||
| 22602 | ||||
| 22603 | // Extend VT if BWI is not supported. | |||
| 22604 | MVT ExtVT = VT; | |||
| 22605 | if (!Subtarget.hasBWI()) { | |||
| 22606 | // If v16i32 is to be avoided, we'll need to split and concatenate. | |||
| 22607 | if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) | |||
| 22608 | return SplitAndExtendv16i1(ISD::ZERO_EXTEND, VT, In, DL, DAG); | |||
| 22609 | ||||
| 22610 | ExtVT = MVT::getVectorVT(MVT::i32, NumElts); | |||
| 22611 | } | |||
| 22612 | ||||
| 22613 | // Widen to 512-bits if VLX is not supported. | |||
| 22614 | MVT WideVT = ExtVT; | |||
| 22615 | if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) { | |||
| 22616 | NumElts *= 512 / ExtVT.getSizeInBits(); | |||
| 22617 | InVT = MVT::getVectorVT(MVT::i1, NumElts); | |||
| 22618 | In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), | |||
| 22619 | In, DAG.getIntPtrConstant(0, DL)); | |||
| 22620 | WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), | |||
| 22621 | NumElts); | |||
| 22622 | } | |||
| 22623 | ||||
| 22624 | SDValue One = DAG.getConstant(1, DL, WideVT); | |||
| 22625 | SDValue Zero = DAG.getConstant(0, DL, WideVT); | |||
| 22626 | ||||
| 22627 | SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero); | |||
| 22628 | ||||
| 22629 | // Truncate if we had to extend above. | |||
| 22630 | if (VT != ExtVT) { | |||
| 22631 | WideVT = MVT::getVectorVT(MVT::i8, NumElts); | |||
| 22632 | SelectedVal = DAG.getNode(ISD::TRUNCATE, DL, WideVT, SelectedVal); | |||
| 22633 | } | |||
| 22634 | ||||
| 22635 | // Extract back to 128/256-bit if we widened. | |||
| 22636 | if (WideVT != VT) | |||
| 22637 | SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal, | |||
| 22638 | DAG.getIntPtrConstant(0, DL)); | |||
| 22639 | ||||
| 22640 | return SelectedVal; | |||
| 22641 | } | |||
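The sign_extend + srl idiom used for the non-vXi8 path can be verified lane-wise: sign-extending an i1 produces 0 or all-ones, and a logical shift right by width - 1 leaves the zero-extended 0 or 1, with no constant-pool load. A minimal sketch with an illustrative helper name:

```cpp
#include <cassert>
#include <cstdint>

uint32_t zextI1ViaSextShift(bool b) {
  int32_t sext = b ? -1 : 0;   // ISD::SIGN_EXTEND: 0 or all-ones
  return (uint32_t)sext >> 31; // ISD::SRL by ScalarSizeInBits - 1
}

int main() {
  assert(zextI1ViaSextShift(true) == 1);
  assert(zextI1ViaSextShift(false) == 0);
  return 0;
}
```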
| 22642 | ||||
| 22643 | static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, | |||
| 22644 | SelectionDAG &DAG) { | |||
| 22645 | SDValue In = Op.getOperand(0); | |||
| 22646 | MVT SVT = In.getSimpleValueType(); | |||
| 22647 | ||||
| 22648 | if (SVT.getVectorElementType() == MVT::i1) | |||
| 22649 | return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG); | |||
| 22650 | ||||
| 22651 | assert(Subtarget.hasAVX() && "Expected AVX support"); | |||
| 22652 | return LowerAVXExtend(Op, DAG, Subtarget); | |||
| 22653 | } | |||
| 22654 | ||||
| 22655 | /// Helper to recursively truncate vector elements in half with PACKSS/PACKUS. | |||
| 22656 | /// It makes use of the fact that vectors with enough leading sign/zero bits | |||
| 22657 | /// prevent the PACKSS/PACKUS from saturating the results. | |||
| 22658 | /// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates | |||
| 22659 | /// within each 128-bit lane. | |||
| 22660 | static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, | |||
| 22661 | const SDLoc &DL, SelectionDAG &DAG, | |||
| 22662 | const X86Subtarget &Subtarget) { | |||
| 22663 | assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) && | |||
| 22664 |        "Unexpected PACK opcode"); | |||
| 22665 | assert(DstVT.isVector() && "VT not a vector?"); | |||
| 22666 | ||||
| 22667 | // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below). | |||
| 22668 | if (!Subtarget.hasSSE2()) | |||
| 22669 | return SDValue(); | |||
| 22670 | ||||
| 22671 | EVT SrcVT = In.getValueType(); | |||
| 22672 | ||||
| 22673 | // No truncation required, we might get here due to recursive calls. | |||
| 22674 | if (SrcVT == DstVT) | |||
| 22675 | return In; | |||
| 22676 | ||||
| 22677 | // We only support vector truncation to 64 bits or greater from a | |||
| 22678 | // 128-bit or greater source. | |||
| 22679 | unsigned DstSizeInBits = DstVT.getSizeInBits(); | |||
| 22680 | unsigned SrcSizeInBits = SrcVT.getSizeInBits(); | |||
| 22681 | if ((DstSizeInBits % 64) != 0 || (SrcSizeInBits % 128) != 0) | |||
| 22682 | return SDValue(); | |||
| 22683 | ||||
| 22684 | unsigned NumElems = SrcVT.getVectorNumElements(); | |||
| 22685 | if (!isPowerOf2_32(NumElems)) | |||
| 22686 | return SDValue(); | |||
| 22687 | ||||
| 22688 | LLVMContext &Ctx = *DAG.getContext(); | |||
| 22689 | assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation"); | |||
| 22690 | assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation"); | |||
| 22691 | ||||
| 22692 | EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2); | |||
| 22693 | ||||
| 22694 | // Pack to the largest type possible: | |||
| 22695 | // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB. | |||
| 22696 | EVT InVT = MVT::i16, OutVT = MVT::i8; | |||
| 22697 | if (SrcVT.getScalarSizeInBits() > 16 && | |||
| 22698 | (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) { | |||
| 22699 | InVT = MVT::i32; | |||
| 22700 | OutVT = MVT::i16; | |||
| 22701 | } | |||
| 22702 | ||||
| 22703 | // 128bit -> 64bit truncate - PACK 128-bit src in the lower subvector. | |||
| 22704 | if (SrcVT.is128BitVector()) { | |||
| 22705 | InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits()); | |||
| 22706 | OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits()); | |||
| 22707 | In = DAG.getBitcast(InVT, In); | |||
| 22708 | SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT)); | |||
| 22709 | Res = extractSubVector(Res, 0, DAG, DL, 64); | |||
| 22710 | return DAG.getBitcast(DstVT, Res); | |||
| 22711 | } | |||
| 22712 | ||||
| 22713 | // Split lower/upper subvectors. | |||
| 22714 | SDValue Lo, Hi; | |||
| 22715 | std::tie(Lo, Hi) = splitVector(In, DAG, DL); | |||
| 22716 | ||||
| 22717 | unsigned SubSizeInBits = SrcSizeInBits / 2; | |||
| 22718 | InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits()); | |||
| 22719 | OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits()); | |||
| 22720 | ||||
| 22721 | // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors. | |||
| 22722 | if (SrcVT.is256BitVector() && DstVT.is128BitVector()) { | |||
| 22723 | Lo = DAG.getBitcast(InVT, Lo); | |||
| 22724 | Hi = DAG.getBitcast(InVT, Hi); | |||
| 22725 | SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi); | |||
| 22726 | return DAG.getBitcast(DstVT, Res); | |||
| 22727 | } | |||
| 22728 | ||||
| 22729 | // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors. | |||
| 22730 | // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK). | |||
| 22731 | if (SrcVT.is512BitVector() && Subtarget.hasInt256()) { | |||
| 22732 | Lo = DAG.getBitcast(InVT, Lo); | |||
| 22733 | Hi = DAG.getBitcast(InVT, Hi); | |||
| 22734 | SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi); | |||
| 22735 | ||||
| 22736 | // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)), | |||
| 22737 | // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)). | |||
| 22738 | // Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits. | |||
| 22739 | SmallVector<int, 64> Mask; | |||
| 22740 | int Scale = 64 / OutVT.getScalarSizeInBits(); | |||
| 22741 | narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask); | |||
| 22742 | Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask); | |||
| 22743 | ||||
| 22744 | if (DstVT.is256BitVector()) | |||
| 22745 | return DAG.getBitcast(DstVT, Res); | |||
| 22746 | ||||
| 22747 | // If 512bit -> 128bit truncate another stage. | |||
| 22748 | EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems); | |||
| 22749 | Res = DAG.getBitcast(PackedVT, Res); | |||
| 22750 | return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget); | |||
| 22751 | } | |||
| 22752 | ||||
| 22753 | // Recursively pack lower/upper subvectors, concat result and pack again. | |||
| 22754 | assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater"); | |||
| 22755 | EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2); | |||
| 22756 | Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget); | |||
| 22757 | Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget); | |||
| 22758 | ||||
| 22759 | PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems); | |||
| 22760 | SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi); | |||
| 22761 | return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget); | |||
| 22762 | } | |||
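The precondition this helper depends on, enough leading sign or zero bits that PACKSS/PACKUS cannot saturate, is easy to model per lane. A standalone sketch of one PACKSSWB lane; the function name is illustrative:

```cpp
#include <cassert>
#include <cstdint>

// One PACKSSWB lane: i16 -> i8 with signed saturation. The recursion in
// truncateVectorWithPACK is only valid when the input already fits the
// narrow type (enough leading sign bits), so saturation never fires and
// the PACK degenerates to an exact truncation.
int8_t packsswbLane(int16_t v) {
  if (v > INT8_MAX) return INT8_MAX; // saturate high
  if (v < INT8_MIN) return INT8_MIN; // saturate low
  return (int8_t)v;                  // in range: plain truncation
}

int main() {
  assert(packsswbLane(100) == 100);   // >= 9 sign bits: value unchanged
  assert(packsswbLane(-100) == -100);
  assert(packsswbLane(300) == 127);   // would saturate; the caller must
  assert(packsswbLane(-300) == -128); // prove this case cannot occur
  return 0;
}
```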
| 22763 | ||||
| 22764 | static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, | |||
| 22765 | const X86Subtarget &Subtarget) { | |||
| 22766 | ||||
| 22767 | SDLoc DL(Op); | |||
| 22768 | MVT VT = Op.getSimpleValueType(); | |||
| 22769 | SDValue In = Op.getOperand(0); | |||
| 22770 | MVT InVT = In.getSimpleValueType(); | |||
| 22771 | ||||
| 22772 | assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type."); | |||
| 22773 | ||||
| 22774 | // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q. | |||
| 22775 | unsigned ShiftInx = InVT.getScalarSizeInBits() - 1; | |||
| 22776 | if (InVT.getScalarSizeInBits() <= 16) { | |||
| 22777 | if (Subtarget.hasBWI()) { | |||
| 22778 | // legal, will go to VPMOVB2M, VPMOVW2M | |||
| 22779 | if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) { | |||
| 22780 | // We need to shift to get the lsb into sign position. | |||
| 22781 | // Shift packed bytes not supported natively, bitcast to word | |||
| 22782 | MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16); | |||
| 22783 | In = DAG.getNode(ISD::SHL, DL, ExtVT, | |||
| 22784 | DAG.getBitcast(ExtVT, In), | |||
| 22785 | DAG.getConstant(ShiftInx, DL, ExtVT)); | |||
| 22786 | In = DAG.getBitcast(InVT, In); | |||
| 22787 | } | |||
| 22788 | return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), | |||
| 22789 | In, ISD::SETGT); | |||
| 22790 | } | |||
| 22791 | // Use TESTD/Q, extended vector to packed dword/qword. | |||
| 22792 | assert((InVT.is256BitVector() || InVT.is128BitVector()) && | |||
| 22793 |        "Unexpected vector type."); | |||
| 22794 | unsigned NumElts = InVT.getVectorNumElements(); | |||
| 22795 | assert((NumElts == 8 || NumElts == 16) && "Unexpected number of elements"); | |||
| 22796 | // We need to change to a wider element type that we have support for. | |||
| 22797 | // For 8 element vectors this is easy, we either extend to v8i32 or v8i64. | |||
| 22798 | // For 16 element vectors we extend to v16i32 unless we are explicitly | |||
| 22799 | // trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors | |||
| 22800 | // we need to split into two 8 element vectors which we can extend to v8i32, | |||
| 22801 | // truncate and concat the results. There's an additional complication if | |||
| 22802 | // the original type is v16i8. In that case we can't split the v16i8 | |||
| 22803 | // directly, so we need to shuffle high elements to low and use | |||
| 22804 | // sign_extend_vector_inreg. | |||
| 22805 | if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) { | |||
| 22806 | SDValue Lo, Hi; | |||
| 22807 | if (InVT == MVT::v16i8) { | |||
| 22808 | Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In); | |||
| 22809 | Hi = DAG.getVectorShuffle( | |||
| 22810 | InVT, DL, In, In, | |||
| 22811 | {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1}); | |||
| 22812 | Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi); | |||
| 22813 | } else { | |||
| 22814 | assert(InVT == MVT::v16i16 && "Unexpected VT!"); | |||
| 22815 | Lo = extract128BitVector(In, 0, DAG, DL); | |||
| 22816 | Hi = extract128BitVector(In, 8, DAG, DL); | |||
| 22817 | } | |||
| 22818 | // We're split now, just emit two truncates and a concat. The two | |||
| 22819 | // truncates will trigger legalization to come back to this function. | |||
| 22820 | Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo); | |||
| 22821 | Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Hi); | |||
| 22822 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); | |||
| 22823 | } | |||
| 22824 | // We either have 8 elements or we're allowed to use 512-bit vectors. | |||
| 22825 | // If we have VLX, we want to use the narrowest vector that can get the | |||
| 22826 | // job done so we use vXi32. | |||
| 22827 | MVT EltVT = Subtarget.hasVLX() ? MVT::i32 : MVT::getIntegerVT(512/NumElts); | |||
| 22828 | MVT ExtVT = MVT::getVectorVT(EltVT, NumElts); | |||
| 22829 | In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In); | |||
| 22830 | InVT = ExtVT; | |||
| 22831 | ShiftInx = InVT.getScalarSizeInBits() - 1; | |||
| 22832 | } | |||
| 22833 | ||||
| 22834 | if (DAG.ComputeNumSignBits(In) < InVT.getScalarSizeInBits()) { | |||
| 22835 | // We need to shift to get the lsb into sign position. | |||
| 22836 | In = DAG.getNode(ISD::SHL, DL, InVT, In, | |||
| 22837 | DAG.getConstant(ShiftInx, DL, InVT)); | |||
| 22838 | } | |||
| 22839 | // If we have DQI, emit a pattern that will be iseled as vpmovq2m/vpmovd2m. | |||
| 22840 | if (Subtarget.hasDQI()) | |||
| 22841 | return DAG.getSetCC(DL, VT, DAG.getConstant(0, DL, InVT), In, ISD::SETGT); | |||
| 22842 | return DAG.getSetCC(DL, VT, In, DAG.getConstant(0, DL, InVT), ISD::SETNE); | |||
| 22843 | } | |||
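A lane-wise model of the shift-LSB-to-MSB-then-SETGT idiom above; the helper name is illustrative:

```cpp
#include <cassert>
#include <cstdint>

// One lane of the BWI path: shift the low bit into the sign position,
// then SETGT against zero (0 > x) reads back exactly that bit.
bool truncLaneToI1(uint16_t lane) {
  uint16_t shifted = (uint16_t)(lane << 15); // ISD::SHL by width - 1
  return (int16_t)0 > (int16_t)shifted;      // ISD::SETGT, zero on the LHS
}

int main() {
  assert(truncLaneToI1(1) == true);
  assert(truncLaneToI1(2) == false); // only the low bit survives
  assert(truncLaneToI1(3) == true);
  return 0;
}
```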
| 22844 | ||||
| 22845 | SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { | |||
| 22846 | SDLoc DL(Op); | |||
| 22847 | MVT VT = Op.getSimpleValueType(); | |||
| 22848 | SDValue In = Op.getOperand(0); | |||
| 22849 | MVT InVT = In.getSimpleValueType(); | |||
| 22850 | unsigned InNumEltBits = InVT.getScalarSizeInBits(); | |||
| 22851 | ||||
| 22852 | assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && | |||
| 22853 |        "Invalid TRUNCATE operation"); | |||
| 22854 | ||||
| 22855 | // If we're called by the type legalizer, handle a few cases. | |||
| 22856 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 22857 | if (!TLI.isTypeLegal(InVT)) { | |||
| 22858 | if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) && | |||
| 22859 | VT.is128BitVector()) { | |||
| 22860 | assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) && | |||
| 22861 |        "Unexpected subtarget!"); | |||
| 22862 | // The default behavior is to truncate one step, concatenate, and then | |||
| 22863 | // truncate the remainder. We'd rather produce two 64-bit results and | |||
| 22864 | // concatenate those. | |||
| 22865 | SDValue Lo, Hi; | |||
| 22866 | std::tie(Lo, Hi) = DAG.SplitVector(In, DL); | |||
| 22867 | ||||
| 22868 | EVT LoVT, HiVT; | |||
| 22869 | std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); | |||
| 22870 | ||||
| 22871 | Lo = DAG.getNode(ISD::TRUNCATE, DL, LoVT, Lo); | |||
| 22872 | Hi = DAG.getNode(ISD::TRUNCATE, DL, HiVT, Hi); | |||
| 22873 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi); | |||
| 22874 | } | |||
| 22875 | ||||
| 22876 | // Otherwise let default legalization handle it. | |||
| 22877 | return SDValue(); | |||
| 22878 | } | |||
| 22879 | ||||
| 22880 | if (VT.getVectorElementType() == MVT::i1) | |||
| 22881 | return LowerTruncateVecI1(Op, DAG, Subtarget); | |||
| 22882 | ||||
| 22883 | // vpmovqb/w/d, vpmovdb/w, vpmovwb | |||
| 22884 | if (Subtarget.hasAVX512()) { | |||
| 22885 | if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) { | |||
| 22886 | assert(VT == MVT::v32i8 && "Unexpected VT!"); | |||
| 22887 | return splitVectorIntUnary(Op, DAG); | |||
| 22888 | } | |||
| 22889 | ||||
| 22890 | // Word to byte is only legal with BWI. Otherwise we have to promote to v16i32 | |||
| 22891 | // and then truncate that. But we should only do that if we haven't been | |||
| 22892 | // asked to avoid 512-bit vectors. The actual promotion to v16i32 will be | |||
| 22893 | // handled by isel patterns. | |||
| 22894 | if (InVT != MVT::v16i16 || Subtarget.hasBWI() || | |||
| 22895 | Subtarget.canExtendTo512DQ()) | |||
| 22896 | return Op; | |||
| 22897 | } | |||
| 22898 | ||||
| 22899 | unsigned NumPackedSignBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16); | |||
| 22900 | unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8; | |||
| 22901 | ||||
| 22902 | // Truncate with PACKUS if we are truncating a vector with leading zero bits | |||
| 22903 | // that extend all the way to the packed/truncated value. | |||
| 22904 | // Pre-SSE41 we can only use PACKUSWB. | |||
| 22905 | KnownBits Known = DAG.computeKnownBits(In); | |||
| 22906 | if ((InNumEltBits - NumPackedZeroBits) <= Known.countMinLeadingZeros()) | |||
| 22907 | if (SDValue V = | |||
| 22908 | truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget)) | |||
| 22909 | return V; | |||
| 22910 | ||||
| 22911 | // Truncate with PACKSS if we are truncating a vector with sign-bits that | |||
| 22912 | // extend all the way to the packed/truncated value. | |||
| 22913 | if ((InNumEltBits - NumPackedSignBits) < DAG.ComputeNumSignBits(In)) | |||
| 22914 | if (SDValue V = | |||
| 22915 | truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget)) | |||
| 22916 | return V; | |||
| 22917 | ||||
| 22918 | // Handle truncation of V256 to V128 using shuffles. | |||
| 22919 | assert(VT.is128BitVector() && InVT.is256BitVector() && "Unexpected types!"); | |||
| 22920 | ||||
| 22921 | if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { | |||
| 22922 | // On AVX2, v4i64 -> v4i32 becomes VPERMD. | |||
| 22923 | if (Subtarget.hasInt256()) { | |||
| 22924 | static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1}; | |||
| 22925 | In = DAG.getBitcast(MVT::v8i32, In); | |||
| 22926 | In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask); | |||
| 22927 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In, | |||
| 22928 | DAG.getIntPtrConstant(0, DL)); | |||
| 22929 | } | |||
| 22930 | ||||
| 22931 | SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, | |||
| 22932 | DAG.getIntPtrConstant(0, DL)); | |||
| 22933 | SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, | |||
| 22934 | DAG.getIntPtrConstant(2, DL)); | |||
| 22935 | static const int ShufMask[] = {0, 2, 4, 6}; | |||
| 22936 | return DAG.getVectorShuffle(VT, DL, DAG.getBitcast(MVT::v4i32, OpLo), | |||
| 22937 | DAG.getBitcast(MVT::v4i32, OpHi), ShufMask); | |||
| 22938 | } | |||
| 22939 | ||||
| 22940 | if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) { | |||
| 22941 | // On AVX2, v8i32 -> v8i16 becomes PSHUFB. | |||
| 22942 | if (Subtarget.hasInt256()) { | |||
| 22943 | // The PSHUFB mask: | |||
| 22944 | static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13, | |||
| 22945 | -1, -1, -1, -1, -1, -1, -1, -1, | |||
| 22946 | 16, 17, 20, 21, 24, 25, 28, 29, | |||
| 22947 | -1, -1, -1, -1, -1, -1, -1, -1 }; | |||
| 22948 | In = DAG.getBitcast(MVT::v32i8, In); | |||
| 22949 | In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1); | |||
| 22950 | In = DAG.getBitcast(MVT::v4i64, In); | |||
| 22951 | ||||
| 22952 | static const int ShufMask2[] = {0, 2, -1, -1}; | |||
| 22953 | In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2); | |||
| 22954 | In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In, | |||
| 22955 | DAG.getIntPtrConstant(0, DL)); | |||
| 22956 | return DAG.getBitcast(MVT::v8i16, In); | |||
| 22957 | } | |||
| 22958 | ||||
| 22959 | SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, | |||
| 22960 | DAG.getIntPtrConstant(0, DL)); | |||
| 22961 | SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In, | |||
| 22962 | DAG.getIntPtrConstant(4, DL)); | |||
| 22963 | ||||
| 22964 | // The PSHUFB mask: | |||
| 22965 | static const int ShufMask1[] = {0, 2, 4, 6, -1, -1, -1, -1}; | |||
| 22966 | ||||
| 22967 | OpLo = DAG.getBitcast(MVT::v8i16, OpLo); | |||
| 22968 | OpHi = DAG.getBitcast(MVT::v8i16, OpHi); | |||
| 22969 | ||||
| 22970 | OpLo = DAG.getVectorShuffle(MVT::v8i16, DL, OpLo, OpLo, ShufMask1); | |||
| 22971 | OpHi = DAG.getVectorShuffle(MVT::v8i16, DL, OpHi, OpHi, ShufMask1); | |||
| 22972 | ||||
| 22973 | OpLo = DAG.getBitcast(MVT::v4i32, OpLo); | |||
| 22974 | OpHi = DAG.getBitcast(MVT::v4i32, OpHi); | |||
| 22975 | ||||
| 22976 | // The MOVLHPS Mask: | |||
| 22977 | static const int ShufMask2[] = {0, 1, 4, 5}; | |||
| 22978 | SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2); | |||
| 22979 | return DAG.getBitcast(MVT::v8i16, res); | |||
| 22980 | } | |||
| 22981 | ||||
| 22982 | if (VT == MVT::v16i8 && InVT == MVT::v16i16) { | |||
| 22983 | // Use an AND to zero the upper bits for PACKUS. | |||
| 22984 | In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(255, DL, InVT)); | |||
| 22985 | ||||
| 22986 | SDValue InLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In, | |||
| 22987 | DAG.getIntPtrConstant(0, DL)); | |||
| 22988 | SDValue InHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v8i16, In, | |||
| 22989 | DAG.getIntPtrConstant(8, DL)); | |||
| 22990 | return DAG.getNode(X86ISD::PACKUS, DL, VT, InLo, InHi); | |||
| 22991 | } | |||
| 22992 | ||||
| 22993 | llvm_unreachable("All 256->128 cases should have been handled above!"); | |||
| 22994 | } | |||
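The v4i64 -> v4i32 path bitcasts to v8i32 and keeps the even sub-lanes, which on a little-endian target is exactly per-lane truncation. A standalone sketch, assuming a little-endian host:

```cpp
#include <cassert>
#include <cstdint>
#include <cstring>

int main() {
  uint64_t in[4] = {0x1111111122222222ULL, 3, 0xFFFFFFFF00000004ULL, 5};

  uint32_t view[8];
  std::memcpy(view, in, sizeof(in)); // models DAG.getBitcast(MVT::v8i32, In)

  static const int ShufMask[4] = {0, 2, 4, 6}; // keep the even sub-lanes
  uint32_t out[4];
  for (int i = 0; i < 4; ++i)
    out[i] = view[ShufMask[i]];

  for (int i = 0; i < 4; ++i)
    assert(out[i] == (uint32_t)in[i]); // per-lane ISD::TRUNCATE
  return 0;
}
```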
| 22995 | ||||
| 22996 | // We can leverage the specific way the "cvttps2dq/cvttpd2dq" instruction | |||
| 22997 | // behaves on out of range inputs to generate optimized conversions. | |||
| 22998 | static SDValue expandFP_TO_UINT_SSE(MVT VT, SDValue Src, const SDLoc &dl, | |||
| 22999 | SelectionDAG &DAG, | |||
| 23000 | const X86Subtarget &Subtarget) { | |||
| 23001 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 23002 | unsigned DstBits = VT.getScalarSizeInBits(); | |||
| 23003 | assert(DstBits == 32 && "expandFP_TO_UINT_SSE - only vXi32 supported"); | |||
| 23004 | ||||
| 23005 | // Calculate the converted result for values in the range 0 to | |||
| 23006 | // 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big"). | |||
| 23007 | SDValue Small = DAG.getNode(X86ISD::CVTTP2SI, dl, VT, Src); | |||
| 23008 | SDValue Big = | |||
| 23009 | DAG.getNode(X86ISD::CVTTP2SI, dl, VT, | |||
| 23010 | DAG.getNode(ISD::FSUB, dl, SrcVT, Src, | |||
| 23011 | DAG.getConstantFP(2147483648.0f, dl, SrcVT))); | |||
| 23012 | ||||
| 23013 | // The "CVTTP2SI" instruction conveniently sets the sign bit if | |||
| 23014 | // and only if the value was out of range. So we can use that | |||
| 23015 | // as our indicator that we'd rather use "Big" instead of "Small". | |||
| 23016 | // | |||
| 23017 | // Use "Small" if "IsOverflown" has all bits cleared | |||
| 23018 | // and "0x80000000 | Big" if all bits in "IsOverflown" are set. | |||
| 23019 | ||||
| 23020 | // AVX1 can't use the signsplat masking for 256-bit vectors - we have to | |||
| 23021 | // use the slightly slower blendv select instead. | |||
| 23022 | if (VT == MVT::v8i32 && !Subtarget.hasAVX2()) { | |||
| 23023 | SDValue Overflow = DAG.getNode(ISD::OR, dl, VT, Small, Big); | |||
| 23024 | return DAG.getNode(X86ISD::BLENDV, dl, VT, Small, Overflow, Small); | |||
| 23025 | } | |||
| 23026 | ||||
| 23027 | SDValue IsOverflown = | |||
| 23028 | DAG.getNode(X86ISD::VSRAI, dl, VT, Small, | |||
| 23029 | DAG.getTargetConstant(DstBits - 1, dl, MVT::i8)); | |||
| 23030 | return DAG.getNode(ISD::OR, dl, VT, Small, | |||
| 23031 | DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown)); | |||
| 23032 | } | |||
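A scalar model of the Small/Big selection. Real CVTTP2SI/CVTTSS2SI write the "integer indefinite" value 0x80000000 for out-of-range inputs, which the model emulates explicitly (the raw C++ cast would be undefined behavior); it also assumes an arithmetic right shift for negative values, matching VSRAI:

```cpp
#include <cassert>
#include <cstdint>

// Emulated CVTTSS2SI: truncate toward zero, or the "integer indefinite"
// value 0x80000000 when the input is outside the signed i32 range.
int32_t cvttss2si(float f) {
  if (!(f >= -2147483648.0f && f < 2147483648.0f))
    return INT32_MIN; // sign bit set if and only if out of range
  return (int32_t)f;
}

uint32_t fpToUintModel(float f) {
  int32_t small = cvttss2si(f);               // exact for [0, 2^31)
  int32_t big = cvttss2si(f - 2147483648.0f); // exact for [2^31, 2^32)
  int32_t isOverflown = small >> 31;          // VSRAI: splat the sign bit
  // Small already carries bit 31 when it overflowed, so OR-ing in the
  // masked Big yields 0x80000000 | Big, i.e. the full unsigned result.
  return (uint32_t)(small | (big & isOverflown));
}

int main() {
  assert(fpToUintModel(42.0f) == 42u);
  assert(fpToUintModel(3000000000.0f) == 3000000000u); // above 2^31
  return 0;
}
```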
| 23033 | ||||
| 23034 | SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const { | |||
| 23035 | bool IsStrict = Op->isStrictFPOpcode(); | |||
| 23036 | bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT || | |||
| 23037 | Op.getOpcode() == ISD::STRICT_FP_TO_SINT; | |||
| 23038 | MVT VT = Op->getSimpleValueType(0); | |||
| 23039 | SDValue Src = Op.getOperand(IsStrict ? 1 : 0); | |||
| 23040 | SDValue Chain = IsStrict ? Op->getOperand(0) : SDValue(); | |||
| 23041 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 23042 | SDLoc dl(Op); | |||
| 23043 | ||||
| 23044 | SDValue Res; | |||
| 23045 | if (isSoftFP16(SrcVT)) { | |||
| 23046 | MVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32; | |||
| 23047 | if (IsStrict) | |||
| 23048 | return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other}, | |||
| 23049 | {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl, | |||
| 23050 | {NVT, MVT::Other}, {Chain, Src})}); | |||
| 23051 | return DAG.getNode(Op.getOpcode(), dl, VT, | |||
| 23052 | DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src)); | |||
| 23053 | } else if (isTypeLegal(SrcVT) && isLegalConversion(VT, IsSigned, Subtarget)) { | |||
| 23054 | return Op; | |||
| 23055 | } | |||
| 23056 | ||||
| 23057 | if (VT.isVector()) { | |||
| 23058 | if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) { | |||
| 23059 | MVT ResVT = MVT::v4i32; | |||
| 23060 | MVT TruncVT = MVT::v4i1; | |||
| 23061 | unsigned Opc; | |||
| 23062 | if (IsStrict) | |||
| 23063 | Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; | |||
| 23064 | else | |||
| 23065 | Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; | |||
| 23066 | ||||
| 23067 | if (!IsSigned && !Subtarget.hasVLX()) { | |||
| 23068 | assert(Subtarget.useAVX512Regs() && "Unexpected features!"); | |||
| 23069 | // Widen to 512-bits. | |||
| 23070 | ResVT = MVT::v8i32; | |||
| 23071 | TruncVT = MVT::v8i1; | |||
| 23072 | Opc = Op.getOpcode(); | |||
| 23073 | // Need to concat with zero vector for strict fp to avoid spurious | |||
| 23074 | // exceptions. | |||
| 23075 | // TODO: Should we just do this for non-strict as well? | |||
| 23076 | SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64) | |||
| 23077 | : DAG.getUNDEF(MVT::v8f64); | |||
| 23078 | Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src, | |||
| 23079 | DAG.getIntPtrConstant(0, dl)); | |||
| 23080 | } | |||
| 23081 | if (IsStrict) { | |||
| 23082 | Res = DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Chain, Src}); | |||
| 23083 | Chain = Res.getValue(1); | |||
| 23084 | } else { | |||
| 23085 | Res = DAG.getNode(Opc, dl, ResVT, Src); | |||
| 23086 | } | |||
| 23087 | ||||
| 23088 | Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res); | |||
| 23089 | Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res, | |||
| 23090 | DAG.getIntPtrConstant(0, dl)); | |||
| 23091 | if (IsStrict) | |||
| 23092 | return DAG.getMergeValues({Res, Chain}, dl); | |||
| 23093 | return Res; | |||
| 23094 | } | |||
| 23095 | ||||
| 23096 | if (Subtarget.hasFP16() && SrcVT.getVectorElementType() == MVT::f16) { | |||
| 23097 | if (VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16) | |||
| 23098 | return Op; | |||
| 23099 | ||||
| 23100 | MVT ResVT = VT; | |||
| 23101 | MVT EleVT = VT.getVectorElementType(); | |||
| 23102 | if (EleVT != MVT::i64) | |||
| 23103 | ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16; | |||
| 23104 | ||||
| 23105 | if (SrcVT != MVT::v8f16) { | |||
| 23106 | SDValue Tmp = | |||
| 23107 | IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT); | |||
| 23108 | SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp); | |||
| 23109 | Ops[0] = Src; | |||
| 23110 | Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops); | |||
| 23111 | } | |||
| 23112 | ||||
| 23113 | if (IsStrict) { | |||
| 23114 | Res = DAG.getNode(IsSigned ? X86ISD::STRICT_CVTTP2SI | |||
| 23115 | : X86ISD::STRICT_CVTTP2UI, | |||
| 23116 | dl, {ResVT, MVT::Other}, {Chain, Src}); | |||
| 23117 | Chain = Res.getValue(1); | |||
| 23118 | } else { | |||
| 23119 | Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, | |||
| 23120 | ResVT, Src); | |||
| 23121 | } | |||
| 23122 | ||||
| 23123 | // TODO: Need to add exception check code for strict FP. | |||
| 23124 | if (EleVT.getSizeInBits() < 16) { | |||
| 23125 | ResVT = MVT::getVectorVT(EleVT, 8); | |||
| 23126 | Res = DAG.getNode(ISD::TRUNCATE, dl, ResVT, Res); | |||
| 23127 | } | |||
| 23128 | ||||
| 23129 | if (ResVT != VT) | |||
| 23130 | Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, | |||
| 23131 | DAG.getIntPtrConstant(0, dl)); | |||
| 23132 | ||||
| 23133 | if (IsStrict) | |||
| 23134 | return DAG.getMergeValues({Res, Chain}, dl); | |||
| 23135 | return Res; | |||
| 23136 | } | |||
| 23137 | ||||
| 23138 | // v8f32/v16f32/v8f64->v8i16/v16i16 need to widen first. | |||
| 23139 | if (VT.getVectorElementType() == MVT::i16) { | |||
| 23140 | assert((SrcVT.getVectorElementType() == MVT::f32 || | |||
| 23141 |         SrcVT.getVectorElementType() == MVT::f64) && | |||
| 23142 |        "Expected f32/f64 vector!"); | |||
| 23143 | MVT NVT = VT.changeVectorElementType(MVT::i32); | |||
| 23144 | if (IsStrict) { | |||
| 23145 | Res = DAG.getNode(IsSigned ? ISD::STRICT_FP_TO_SINT | |||
| 23146 | : ISD::STRICT_FP_TO_UINT, | |||
| 23147 | dl, {NVT, MVT::Other}, {Chain, Src}); | |||
| 23148 | Chain = Res.getValue(1); | |||
| 23149 | } else { | |||
| 23150 | Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl, | |||
| 23151 | NVT, Src); | |||
| 23152 | } | |||
| 23153 | ||||
| 23154 | // TODO: Need to add exception check code for strict FP. | |||
| 23155 | Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); | |||
| 23156 | ||||
| 23157 | if (IsStrict) | |||
| 23158 | return DAG.getMergeValues({Res, Chain}, dl); | |||
| 23159 | return Res; | |||
| 23160 | } | |||
| 23161 | ||||
| 23162 | // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32. | |||
| 23163 | if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) { | |||
| 23164 | assert(!IsSigned && "Expected unsigned conversion!"); | |||
| 23165 | assert(Subtarget.useAVX512Regs() && "Requires avx512f"); | |||
| 23166 | return Op; | |||
| 23167 | } | |||
| 23168 | ||||
| 23169 | // Widen vXi32 fp_to_uint with avx512f to 512-bit source. | |||
| 23170 | if ((VT == MVT::v4i32 || VT == MVT::v8i32) && | |||
| 23171 | (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32) && | |||
| 23172 | Subtarget.useAVX512Regs()) { | |||
| 23173 | assert(!IsSigned && "Expected unsigned conversion!"); | |||
| 23174 | assert(!Subtarget.hasVLX() && "Unexpected features!"); | |||
| 23175 | MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32; | |||
| 23176 | MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32; | |||
| 23177 | // Need to concat with zero vector for strict fp to avoid spurious | |||
| 23178 | // exceptions. | |||
| 23179 | // TODO: Should we just do this for non-strict as well? | |||
| 23180 | SDValue Tmp = | |||
| 23181 | IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT); | |||
| 23182 | Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src, | |||
| 23183 | DAG.getIntPtrConstant(0, dl)); | |||
| 23184 | ||||
| 23185 | if (IsStrict) { | |||
| 23186 | Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other}, | |||
| 23187 | {Chain, Src}); | |||
| 23188 | Chain = Res.getValue(1); | |||
| 23189 | } else { | |||
| 23190 | Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src); | |||
| 23191 | } | |||
| 23192 | ||||
| 23193 | Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, | |||
| 23194 | DAG.getIntPtrConstant(0, dl)); | |||
| 23195 | ||||
| 23196 | if (IsStrict) | |||
| 23197 | return DAG.getMergeValues({Res, Chain}, dl); | |||
| 23198 | return Res; | |||
| 23199 | } | |||
| 23200 | ||||
| 23201 | // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source. | |||
| 23202 | if ((VT == MVT::v2i64 || VT == MVT::v4i64) && | |||
| 23203 | (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32) && | |||
| 23204 | Subtarget.useAVX512Regs() && Subtarget.hasDQI()) { | |||
| 23205 | assert(!Subtarget.hasVLX() && "Unexpected features!"); | |||
| 23206 | MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64; | |||
| 23207 | // Need to concat with zero vector for strict fp to avoid spurious | |||
| 23208 | // exceptions. | |||
| 23209 | // TODO: Should we just do this for non-strict as well? | |||
| 23210 | SDValue Tmp = | |||
| 23211 | IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT); | |||
| 23212 | Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src, | |||
| 23213 | DAG.getIntPtrConstant(0, dl)); | |||
| 23214 | ||||
| 23215 | if (IsStrict) { | |||
| 23216 | Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other}, | |||
| 23217 | {Chain, Src}); | |||
| 23218 | Chain = Res.getValue(1); | |||
| 23219 | } else { | |||
| 23220 | Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src); | |||
| 23221 | } | |||
| 23222 | ||||
| 23223 | Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res, | |||
| 23224 | DAG.getIntPtrConstant(0, dl)); | |||
| 23225 | ||||
| 23226 | if (IsStrict) | |||
| 23227 | return DAG.getMergeValues({Res, Chain}, dl); | |||
| 23228 | return Res; | |||
| 23229 | } | |||
| 23230 | ||||
| 23231 | if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) { | |||
| 23232 | if (!Subtarget.hasVLX()) { | |||
| 23233 | // Non-strict nodes without VLX can be widened to v4f32->v4i64 by the type | |||
| 23234 | // legalizer and then widened again by vector op legalization. | |||
| 23235 | if (!IsStrict) | |||
| 23236 | return SDValue(); | |||
| 23237 | ||||
| 23238 | SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32); | |||
| 23239 | SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32, | |||
| 23240 | {Src, Zero, Zero, Zero}); | |||
| 23241 | Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other}, | |||
| 23242 | {Chain, Tmp}); | |||
| 23243 | SDValue Chain = Tmp.getValue(1); | |||
| 23244 | Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp, | |||
| 23245 | DAG.getIntPtrConstant(0, dl)); | |||
| 23246 | return DAG.getMergeValues({Tmp, Chain}, dl); | |||
| 23247 | } | |||
| 23248 | ||||
| 23249 | assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL"); | |||
| 23250 | SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, | |||
| 23251 | DAG.getUNDEF(MVT::v2f32)); | |||
| 23252 | if (IsStrict) { | |||
| 23253 | unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI | |||
| 23254 | : X86ISD::STRICT_CVTTP2UI; | |||
| 23255 | return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp}); | |||
| 23256 | } | |||
| 23257 | unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; | |||
| 23258 | return DAG.getNode(Opc, dl, VT, Tmp); | |||
| 23259 | } | |||
| 23260 | ||||
| 23261 | // Generate optimized instructions for pre AVX512 unsigned conversions from | |||
| 23262 | // vXf32 to vXi32. | |||
| 23263 | if ((VT == MVT::v4i32 && SrcVT == MVT::v4f32) || | |||
| 23264 | (VT == MVT::v4i32 && SrcVT == MVT::v4f64) || | |||
| 23265 | (VT == MVT::v8i32 && SrcVT == MVT::v8f32)) { | |||
| 23266 | assert(!IsSigned && "Expected unsigned conversion!"); | |||
| 23267 | return expandFP_TO_UINT_SSE(VT, Src, dl, DAG, Subtarget); | |||
| 23268 | } | |||
| 23269 | ||||
| 23270 | return SDValue(); | |||
| 23271 | } | |||
| 23272 | ||||
| 23273 | assert(!VT.isVector()); | |||
| 23274 | ||||
| 23275 | bool UseSSEReg = isScalarFPTypeInSSEReg(SrcVT); | |||
| 23276 | ||||
| 23277 | if (!IsSigned && UseSSEReg) { | |||
| 23278 | // Conversions from f32/f64 with AVX512 should be legal. | |||
| 23279 | if (Subtarget.hasAVX512()) | |||
| 23280 | return Op; | |||
| 23281 | ||||
| 23282 | // We can leverage the specific way the "cvttss2si/cvttsd2si" instruction | |||
| 23283 | // behaves on out of range inputs to generate optimized conversions. | |||
| 23284 | if (!IsStrict && ((VT == MVT::i32 && !Subtarget.is64Bit()) || | |||
| 23285 | (VT == MVT::i64 && Subtarget.is64Bit()))) { | |||
| 23286 | unsigned DstBits = VT.getScalarSizeInBits(); | |||
| 23287 | APInt UIntLimit = APInt::getSignMask(DstBits); | |||
| 23288 | SDValue FloatOffset = DAG.getNode(ISD::UINT_TO_FP, dl, SrcVT, | |||
| 23289 | DAG.getConstant(UIntLimit, dl, VT)); | |||
| 23290 | MVT SrcVecVT = MVT::getVectorVT(SrcVT, 128 / SrcVT.getScalarSizeInBits()); | |||
| 23291 | ||||
| 23292 | // Calculate the converted result for values in the range: | |||
| 23293 | // (i32) 0 to 2^31-1 ("Small") and from 2^31 to 2^32-1 ("Big"). | |||
| 23294 | // (i64) 0 to 2^63-1 ("Small") and from 2^63 to 2^64-1 ("Big"). | |||
| 23295 | SDValue Small = | |||
| 23296 | DAG.getNode(X86ISD::CVTTS2SI, dl, VT, | |||
| 23297 | DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, Src)); | |||
| 23298 | SDValue Big = DAG.getNode( | |||
| 23299 | X86ISD::CVTTS2SI, dl, VT, | |||
| 23300 | DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, SrcVecVT, | |||
| 23301 | DAG.getNode(ISD::FSUB, dl, SrcVT, Src, FloatOffset))); | |||
| 23302 | ||||
| 23303 | // The "CVTTS2SI" instruction conveniently sets the sign bit if | |||
| 23304 | // and only if the value was out of range. So we can use that | |||
| 23305 | // as our indicator that we'd rather use "Big" instead of "Small". | |||
| 23306 | // | |||
| 23307 | // Use "Small" if "IsOverflown" has all bits cleared | |||
| 23308 | // and "0x80000000 | Big" if all bits in "IsOverflown" are set. | |||
| 23309 | SDValue IsOverflown = DAG.getNode( | |||
| 23310 | ISD::SRA, dl, VT, Small, DAG.getConstant(DstBits - 1, dl, MVT::i8)); | |||
| 23311 | return DAG.getNode(ISD::OR, dl, VT, Small, | |||
| 23312 | DAG.getNode(ISD::AND, dl, VT, Big, IsOverflown)); | |||
| 23313 | } | |||
| 23314 | ||||
| 23315 | // Use default expansion for i64. | |||
| 23316 | if (VT == MVT::i64) | |||
| 23317 | return SDValue(); | |||
| 23318 | ||||
| 23319 | assert(VT == MVT::i32 && "Unexpected VT!"); | |||
| 23320 | ||||
| 23321 | // Promote i32 to i64 and use a signed operation on 64-bit targets. | |||
| 23322 | // FIXME: This does not generate an invalid exception if the input does not | |||
| 23323 | // fit in i32. PR44019 | |||
| 23324 | if (Subtarget.is64Bit()) { | |||
| 23325 | if (IsStrict) { | |||
| 23326 | Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i64, MVT::Other}, | |||
| 23327 | {Chain, Src}); | |||
| 23328 | Chain = Res.getValue(1); | |||
| 23329 | } else | |||
| 23330 | Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src); | |||
| 23331 | ||||
| 23332 | Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); | |||
| 23333 | if (IsStrict) | |||
| 23334 | return DAG.getMergeValues({Res, Chain}, dl); | |||
| 23335 | return Res; | |||
| 23336 | } | |||
| 23337 | ||||
| 23338 | // Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can | |||
| 23339 | // use fisttp which will be handled later. | |||
| 23340 | if (!Subtarget.hasSSE3()) | |||
| 23341 | return SDValue(); | |||
| 23342 | } | |||
| 23343 | ||||
| 23344 | // Promote i16 to i32 if we can use a SSE operation or the type is f128. | |||
| 23345 | // FIXME: This does not generate an invalid exception if the input does not | |||
| 23346 | // fit in i16. PR44019 | |||
| 23347 | if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) { | |||
| 23348 | assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!"); | |||
| 23349 | if (IsStrict) { | |||
| 23350 | Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {MVT::i32, MVT::Other}, | |||
| 23351 | {Chain, Src}); | |||
| 23352 | Chain = Res.getValue(1); | |||
| 23353 | } else | |||
| 23354 | Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src); | |||
| 23355 | ||||
| 23356 | Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); | |||
| 23357 | if (IsStrict) | |||
| 23358 | return DAG.getMergeValues({Res, Chain}, dl); | |||
| 23359 | return Res; | |||
| 23360 | } | |||
| 23361 | ||||
| 23362 | // If this is a FP_TO_SINT using SSEReg we're done. | |||
| 23363 | if (UseSSEReg && IsSigned) | |||
| 23364 | return Op; | |||
| 23365 | ||||
| 23366 | // fp128 needs to use a libcall. | |||
| 23367 | if (SrcVT == MVT::f128) { | |||
| 23368 | RTLIB::Libcall LC; | |||
| 23369 | if (IsSigned) | |||
| 23370 | LC = RTLIB::getFPTOSINT(SrcVT, VT); | |||
| 23371 | else | |||
| 23372 | LC = RTLIB::getFPTOUINT(SrcVT, VT); | |||
| 23373 | ||||
| 23374 | MakeLibCallOptions CallOptions; | |||
| 23375 | std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions, | |||
| 23376 | SDLoc(Op), Chain); | |||
| 23377 | ||||
| 23378 | if (IsStrict) | |||
| 23379 | return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl); | |||
| 23380 | ||||
| 23381 | return Tmp.first; | |||
| 23382 | } | |||
| 23383 | ||||
| 23384 | // Fall back to X87. | |||
| 23385 | if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) { | |||
| 23386 | if (IsStrict) | |||
| 23387 | return DAG.getMergeValues({V, Chain}, dl); | |||
| 23388 | return V; | |||
| 23389 | } | |||
| 23390 | ||||
| 23391 | llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases."); | |||
| 23392 | } | |||
| 23393 | ||||
| 23394 | SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op, | |||
| 23395 | SelectionDAG &DAG) const { | |||
| 23396 | SDValue Src = Op.getOperand(0); | |||
| 23397 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 23398 | ||||
| 23399 | if (SrcVT == MVT::f16) | |||
| 23400 | return SDValue(); | |||
| 23401 | ||||
| 23402 | // If the source is in an SSE register, the node is Legal. | |||
| 23403 | if (isScalarFPTypeInSSEReg(SrcVT)) | |||
| 23404 | return Op; | |||
| 23405 | ||||
| 23406 | return LRINT_LLRINTHelper(Op.getNode(), DAG); | |||
| 23407 | } | |||
| 23408 | ||||
| 23409 | SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N, | |||
| 23410 | SelectionDAG &DAG) const { | |||
| 23411 | EVT DstVT = N->getValueType(0); | |||
| 23412 | SDValue Src = N->getOperand(0); | |||
| 23413 | EVT SrcVT = Src.getValueType(); | |||
| 23414 | ||||
| 23415 | if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) { | |||
| 23416 | // f16 must be promoted before using the lowering in this routine. | |||
| 23417 | // fp128 does not use this lowering. | |||
| 23418 | return SDValue(); | |||
| 23419 | } | |||
| 23420 | ||||
| 23421 | SDLoc DL(N); | |||
| 23422 | SDValue Chain = DAG.getEntryNode(); | |||
| 23423 | ||||
| 23424 | bool UseSSE = isScalarFPTypeInSSEReg(SrcVT); | |||
| 23425 | ||||
| 23426 | // If we're converting from SSE, the stack slot needs to hold both types. | |||
| 23427 | // Otherwise it only needs to hold the DstVT. | |||
| 23428 | EVT OtherVT = UseSSE ? SrcVT : DstVT; | |||
| 23429 | SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT); | |||
| 23430 | int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); | |||
| 23431 | MachinePointerInfo MPI = | |||
| 23432 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); | |||
| 23433 | ||||
| 23434 | if (UseSSE) { | |||
| 23435 | assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!"); | |||
| 23436 | Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI); | |||
| 23437 | SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); | |||
| 23438 | SDValue Ops[] = { Chain, StackPtr }; | |||
| 23439 | ||||
| 23440 | Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI, | |||
| 23441 | /*Align*/ std::nullopt, | |||
| 23442 | MachineMemOperand::MOLoad); | |||
| 23443 | Chain = Src.getValue(1); | |||
| 23444 | } | |||
| 23445 | ||||
| 23446 | SDValue StoreOps[] = { Chain, Src, StackPtr }; | |||
| 23447 | Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other), | |||
| 23448 | StoreOps, DstVT, MPI, /*Align*/ std::nullopt, | |||
| 23449 | MachineMemOperand::MOStore); | |||
| 23450 | ||||
| 23451 | return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI); | |||
| 23452 | } | |||
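FIST is a faithful lowering for lrint/llrint because both convert using the current rounding mode rather than truncating. A small host-side illustration, assuming the compiler does not constant-fold the rounding-mode-sensitive calls (e.g. build with -frounding-math):

```cpp
#include <cassert>
#include <cfenv>
#include <cmath>

int main() {
  std::fesetround(FE_TONEAREST);
  assert(std::llrint(2.5) == 2); // ties-to-even, exactly like FIST
  std::fesetround(FE_TOWARDZERO);
  assert(std::llrint(2.9) == 2); // the mode change is honored
  std::fesetround(FE_TONEAREST);
  return 0;
}
```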
| 23453 | ||||
| 23454 | SDValue | |||
| 23455 | X86TargetLowering::LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const { | |||
| 23456 | // This is based on the TargetLowering::expandFP_TO_INT_SAT implementation, | |||
| 23457 | // but making use of X86 specifics to produce better instruction sequences. | |||
| 23458 | SDNode *Node = Op.getNode(); | |||
| 23459 | bool IsSigned = Node->getOpcode() == ISD::FP_TO_SINT_SAT; | |||
| 23460 | unsigned FpToIntOpcode = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT; | |||
| 23461 | SDLoc dl(SDValue(Node, 0)); | |||
| 23462 | SDValue Src = Node->getOperand(0); | |||
| 23463 | ||||
| 23464 | // There are three types involved here: SrcVT is the source floating point | |||
| 23465 | // type, DstVT is the type of the result, and TmpVT is the result of the | |||
| 23466 | // intermediate FP_TO_*INT operation we'll use (which may be a promotion of | |||
| 23467 | // DstVT). | |||
| 23468 | EVT SrcVT = Src.getValueType(); | |||
| 23469 | EVT DstVT = Node->getValueType(0); | |||
| 23470 | EVT TmpVT = DstVT; | |||
| 23471 | ||||
| 23472 | // This code is only for floats and doubles. Fall back to generic code for | |||
| 23473 | // anything else. | |||
| 23474 | if (!isScalarFPTypeInSSEReg(SrcVT) || isSoftFP16(SrcVT)) | |||
| 23475 | return SDValue(); | |||
| 23476 | ||||
| 23477 | EVT SatVT = cast<VTSDNode>(Node->getOperand(1))->getVT(); | |||
| 23478 | unsigned SatWidth = SatVT.getScalarSizeInBits(); | |||
| 23479 | unsigned DstWidth = DstVT.getScalarSizeInBits(); | |||
| 23480 | unsigned TmpWidth = TmpVT.getScalarSizeInBits(); | |||
| 23481 | assert(SatWidth <= DstWidth && SatWidth <= TmpWidth && | |||
| 23482 | "Expected saturation width smaller than result width"); | |||
| 23483 | ||||
| 23484 | // Promote result of FP_TO_*INT to at least 32 bits. | |||
| 23485 | if (TmpWidth < 32) { | |||
| 23486 | TmpVT = MVT::i32; | |||
| 23487 | TmpWidth = 32; | |||
| 23488 | } | |||
| 23489 | ||||
| 23490 | // Promote conversions to unsigned 32-bit to 64-bit, because it will allow | |||
| 23491 | // us to use a native signed conversion instead. | |||
| 23492 | if (SatWidth == 32 && !IsSigned && Subtarget.is64Bit()) { | |||
| 23493 | TmpVT = MVT::i64; | |||
| 23494 | TmpWidth = 64; | |||
| 23495 | } | |||
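| | // Illustrative example: a fptoui.sat to i32 from f64 on x86-64 is then | |||
| | // computed with the native signed 64-bit cvttsd2si and clamped afterwards, | |||
| | // since every value in [0, 2^32) is in range for a signed i64 conversion. | |||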
| 23496 | ||||
| 23497 | // If the saturation width is smaller than the size of the temporary result, | |||
| 23498 | // we can always use signed conversion, which is native. | |||
| 23499 | if (SatWidth < TmpWidth) | |||
| 23500 | FpToIntOpcode = ISD::FP_TO_SINT; | |||
| 23501 | ||||
| 23502 | // Determine minimum and maximum integer values and their corresponding | |||
| 23503 | // floating-point values. | |||
| 23504 | APInt MinInt, MaxInt; | |||
| 23505 | if (IsSigned) { | |||
| 23506 | MinInt = APInt::getSignedMinValue(SatWidth).sext(DstWidth); | |||
| 23507 | MaxInt = APInt::getSignedMaxValue(SatWidth).sext(DstWidth); | |||
| 23508 | } else { | |||
| 23509 | MinInt = APInt::getMinValue(SatWidth).zext(DstWidth); | |||
| 23510 | MaxInt = APInt::getMaxValue(SatWidth).zext(DstWidth); | |||
| 23511 | } | |||
| 23512 | ||||
| 23513 | APFloat MinFloat(DAG.EVTToAPFloatSemantics(SrcVT)); | |||
| 23514 | APFloat MaxFloat(DAG.EVTToAPFloatSemantics(SrcVT)); | |||
| 23515 | ||||
| 23516 | APFloat::opStatus MinStatus = MinFloat.convertFromAPInt( | |||
| 23517 | MinInt, IsSigned, APFloat::rmTowardZero); | |||
| 23518 | APFloat::opStatus MaxStatus = MaxFloat.convertFromAPInt( | |||
| 23519 | MaxInt, IsSigned, APFloat::rmTowardZero); | |||
| 23520 | bool AreExactFloatBounds = !(MinStatus & APFloat::opStatus::opInexact) | |||
| 23521 | && !(MaxStatus & APFloat::opStatus::opInexact); | |||
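| | // Example: for signed i32 saturation from f32, MinInt = -2^31 is exactly | |||
| | // representable, but MaxInt = 2^31 - 1 rounds up to 2^31 in f32, so the | |||
| | // bounds are inexact and the compare/select path below is taken. | |||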
| 23522 | ||||
| 23523 | SDValue MinFloatNode = DAG.getConstantFP(MinFloat, dl, SrcVT); | |||
| 23524 | SDValue MaxFloatNode = DAG.getConstantFP(MaxFloat, dl, SrcVT); | |||
| 23525 | ||||
| 23526 | // If the integer bounds are exactly representable as floats, emit a | |||
| 23527 | // min+max+fptoi sequence. Otherwise use comparisons and selects. | |||
| 23528 | if (AreExactFloatBounds) { | |||
| 23529 | if (DstVT != TmpVT) { | |||
| 23530 | // Clamp by MinFloat from below. If Src is NaN, propagate NaN. | |||
| 23531 | SDValue MinClamped = DAG.getNode( | |||
| 23532 | X86ISD::FMAX, dl, SrcVT, MinFloatNode, Src); | |||
| 23533 | // Clamp by MaxFloat from above. If Src is NaN, propagate NaN. | |||
| 23534 | SDValue BothClamped = DAG.getNode( | |||
| 23535 | X86ISD::FMIN, dl, SrcVT, MaxFloatNode, MinClamped); | |||
| 23536 | // Convert clamped value to integer. | |||
| 23537 | SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, BothClamped); | |||
| 23538 | ||||
| 23539 | // NaN will become INDVAL, with the top bit set and the rest zero. | |||
| 23540 | // Truncation will discard the top bit, resulting in zero. | |||
| 23541 | return DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt); | |||
| 23542 | } | |||
| 23543 | ||||
| 23544 | // Clamp by MinFloat from below. If Src is NaN, the result is MinFloat. | |||
| 23545 | SDValue MinClamped = DAG.getNode( | |||
| 23546 | X86ISD::FMAX, dl, SrcVT, Src, MinFloatNode); | |||
| 23547 | // Clamp by MaxFloat from above. NaN cannot occur. | |||
| 23548 | SDValue BothClamped = DAG.getNode( | |||
| 23549 | X86ISD::FMINC, dl, SrcVT, MinClamped, MaxFloatNode); | |||
| 23550 | // Convert clamped value to integer. | |||
| 23551 | SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, DstVT, BothClamped); | |||
| 23552 | ||||
| 23553 | if (!IsSigned) { | |||
| 23554 | // In the unsigned case we're done, because we mapped NaN to MinFloat, | |||
| 23555 | // which is zero. | |||
| 23556 | return FpToInt; | |||
| 23557 | } | |||
| 23558 | ||||
| 23559 | // Otherwise, select zero if Src is NaN. | |||
| 23560 | SDValue ZeroInt = DAG.getConstant(0, dl, DstVT); | |||
| 23561 | return DAG.getSelectCC( | |||
| 23562 | dl, Src, Src, ZeroInt, FpToInt, ISD::CondCode::SETUO); | |||
| 23563 | } | |||
| 23564 | ||||
| 23565 | SDValue MinIntNode = DAG.getConstant(MinInt, dl, DstVT); | |||
| 23566 | SDValue MaxIntNode = DAG.getConstant(MaxInt, dl, DstVT); | |||
| 23567 | ||||
| 23568 | // Result of direct conversion, which may be selected away. | |||
| 23569 | SDValue FpToInt = DAG.getNode(FpToIntOpcode, dl, TmpVT, Src); | |||
| 23570 | ||||
| 23571 | if (DstVT != TmpVT) { | |||
| 23572 | // NaN will become INDVAL, with the top bit set and the rest zero. | |||
| 23573 | // Truncation will discard the top bit, resulting in zero. | |||
| 23574 | FpToInt = DAG.getNode(ISD::TRUNCATE, dl, DstVT, FpToInt); | |||
| 23575 | } | |||
| 23576 | ||||
| 23577 | SDValue Select = FpToInt; | |||
| 23578 | // For signed conversions where we saturate to the same size as the | |||
| 23579 | // result type of the fptoi instructions, INDVAL coincides with integer | |||
| 23580 | // minimum, so we don't need to explicitly check it. | |||
| 23581 | if (!IsSigned || SatWidth != TmpVT.getScalarSizeInBits()) { | |||
| 23582 | // If Src ULT MinFloat, select MinInt. In particular, this also selects | |||
| 23583 | // MinInt if Src is NaN. | |||
| 23584 | Select = DAG.getSelectCC( | |||
| 23585 | dl, Src, MinFloatNode, MinIntNode, Select, ISD::CondCode::SETULT); | |||
| 23586 | } | |||
| 23587 | ||||
| 23588 | // If Src OGT MaxFloat, select MaxInt. | |||
| 23589 | Select = DAG.getSelectCC( | |||
| 23590 | dl, Src, MaxFloatNode, MaxIntNode, Select, ISD::CondCode::SETOGT); | |||
| 23591 | ||||
| 23592 | // In the unsigned case we are done, because we mapped NaN to MinInt, which | |||
| 23593 | // is already zero. The promoted case was already handled above. | |||
| 23594 | if (!IsSigned || DstVT != TmpVT) { | |||
| 23595 | return Select; | |||
| 23596 | } | |||
| 23597 | ||||
| 23598 | // Otherwise, select 0 if Src is NaN. | |||
| 23599 | SDValue ZeroInt = DAG.getConstant(0, dl, DstVT); | |||
| 23600 | return DAG.getSelectCC( | |||
| 23601 | dl, Src, Src, ZeroInt, Select, ISD::CondCode::SETUO); | |||
| 23602 | } | |||
| 23603 | ||||
| 23604 | SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { | |||
| 23605 | bool IsStrict = Op->isStrictFPOpcode(); | |||
| 23606 | ||||
| 23607 | SDLoc DL(Op); | |||
| 23608 | MVT VT = Op.getSimpleValueType(); | |||
| 23609 | SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); | |||
| 23610 | SDValue In = Op.getOperand(IsStrict ? 1 : 0); | |||
| 23611 | MVT SVT = In.getSimpleValueType(); | |||
| 23612 | ||||
| 23613 | // Let f16->f80 get lowered to a libcall, except for darwin, where we should | |||
| 23614 | // lower it to an fp_extend via f32 (as only f16<>f32 libcalls are available) | |||
| 23615 | if (VT == MVT::f128 || (SVT == MVT::f16 && VT == MVT::f80 && | |||
| 23616 | !Subtarget.getTargetTriple().isOSDarwin())) | |||
| 23617 | return SDValue(); | |||
| 23618 | ||||
| 23619 | if (SVT == MVT::f16) { | |||
| 23620 | if (Subtarget.hasFP16()) | |||
| 23621 | return Op; | |||
| 23622 | ||||
| 23623 | if (VT != MVT::f32) { | |||
| 23624 | if (IsStrict) | |||
| 23625 | return DAG.getNode( | |||
| 23626 | ISD::STRICT_FP_EXTEND, DL, {VT, MVT::Other}, | |||
| 23627 | {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, DL, | |||
| 23628 | {MVT::f32, MVT::Other}, {Chain, In})}); | |||
| 23629 | ||||
| 23630 | return DAG.getNode(ISD::FP_EXTEND, DL, VT, | |||
| 23631 | DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, In)); | |||
| 23632 | } | |||
| 23633 | ||||
| 23634 | if (!Subtarget.hasF16C()) { | |||
| 23635 | if (!Subtarget.getTargetTriple().isOSDarwin()) | |||
| 23636 | return SDValue(); | |||
| 23637 | ||||
| 23638 | assert(VT == MVT::f32 && SVT == MVT::f16 && "unexpected extend libcall"); | |||
| 23639 | ||||
| 23640 | // Need a libcall, but ABI for f16 is soft-float on MacOS. | |||
| 23641 | TargetLowering::CallLoweringInfo CLI(DAG); | |||
| 23642 | Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); | |||
| 23643 | ||||
| 23644 | In = DAG.getBitcast(MVT::i16, In); | |||
| 23645 | TargetLowering::ArgListTy Args; | |||
| 23646 | TargetLowering::ArgListEntry Entry; | |||
| 23647 | Entry.Node = In; | |||
| 23648 | Entry.Ty = EVT(MVT::i16).getTypeForEVT(*DAG.getContext()); | |||
| 23649 | Entry.IsSExt = false; | |||
| 23650 | Entry.IsZExt = true; | |||
| 23651 | Args.push_back(Entry); | |||
| 23652 | ||||
| 23653 | SDValue Callee = DAG.getExternalSymbol( | |||
| 23654 | getLibcallName(RTLIB::FPEXT_F16_F32), | |||
| 23655 | getPointerTy(DAG.getDataLayout())); | |||
| 23656 | CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( | |||
| 23657 | CallingConv::C, EVT(VT).getTypeForEVT(*DAG.getContext()), Callee, | |||
| 23658 | std::move(Args)); | |||
| 23659 | ||||
| 23660 | SDValue Res; | |||
| 23661 | std::tie(Res, Chain) = LowerCallTo(CLI); | |||
| 23662 | if (IsStrict) | |||
| 23663 | Res = DAG.getMergeValues({Res, Chain}, DL); | |||
| 23664 | ||||
| 23665 | return Res; | |||
| 23666 | } | |||
| 23667 | ||||
| 23668 | In = DAG.getBitcast(MVT::i16, In); | |||
| 23669 | In = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v8i16, | |||
| 23670 | getZeroVector(MVT::v8i16, Subtarget, DAG, DL), In, | |||
| 23671 | DAG.getIntPtrConstant(0, DL)); | |||
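| | // The scalar half is inserted into lane 0 of a zero vector so the packed | |||
| | // CVTPH2PS conversion can be used; lane 0 of the resulting v4f32 is | |||
| | // extracted back out below. | |||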
| 23672 | SDValue Res; | |||
| 23673 | if (IsStrict) { | |||
| 23674 | Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, DL, {MVT::v4f32, MVT::Other}, | |||
| 23675 | {Chain, In}); | |||
| 23676 | Chain = Res.getValue(1); | |||
| 23677 | } else { | |||
| 23678 | Res = DAG.getNode(X86ISD::CVTPH2PS, DL, MVT::v4f32, In, | |||
| 23679 | DAG.getTargetConstant(4, DL, MVT::i32)); | |||
| 23680 | } | |||
| 23681 | Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Res, | |||
| 23682 | DAG.getIntPtrConstant(0, DL)); | |||
| 23683 | if (IsStrict) | |||
| 23684 | return DAG.getMergeValues({Res, Chain}, DL); | |||
| 23685 | return Res; | |||
| 23686 | } | |||
| 23687 | ||||
| 23688 | if (!SVT.isVector()) | |||
| 23689 | return Op; | |||
| 23690 | ||||
| 23691 | if (SVT.getVectorElementType() == MVT::f16) { | |||
| 23692 | assert(Subtarget.hasF16C() && "Unexpected features!"); | |||
| 23693 | if (SVT == MVT::v2f16) | |||
| 23694 | In = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f16, In, | |||
| 23695 | DAG.getUNDEF(MVT::v2f16)); | |||
| 23696 | SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8f16, In, | |||
| 23697 | DAG.getUNDEF(MVT::v4f16)); | |||
| 23698 | if (IsStrict) | |||
| 23699 | return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other}, | |||
| 23700 | {Op->getOperand(0), Res}); | |||
| 23701 | return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res); | |||
| 23702 | } else if (VT == MVT::v4f64 || VT == MVT::v8f64) { | |||
| 23703 | return Op; | |||
| 23704 | } | |||
| 23705 | ||||
| 23706 | assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!"); | |||
| 23707 | ||||
| 23708 | SDValue Res = | |||
| 23709 | DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT)); | |||
| 23710 | if (IsStrict) | |||
| 23711 | return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other}, | |||
| 23712 | {Op->getOperand(0), Res}); | |||
| 23713 | return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res); | |||
| 23714 | } | |||
| 23715 | ||||
| 23716 | SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { | |||
| 23717 | bool IsStrict = Op->isStrictFPOpcode(); | |||
| 23718 | ||||
| 23719 | SDLoc DL(Op); | |||
| 23720 | SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); | |||
| 23721 | SDValue In = Op.getOperand(IsStrict ? 1 : 0); | |||
| 23722 | MVT VT = Op.getSimpleValueType(); | |||
| 23723 | MVT SVT = In.getSimpleValueType(); | |||
| 23724 | ||||
| 23725 | if (SVT == MVT::f128 || (VT == MVT::f16 && SVT == MVT::f80)) | |||
| 23726 | return SDValue(); | |||
| 23727 | ||||
| 23728 | if (VT == MVT::f16 && (SVT == MVT::f64 || SVT == MVT::f32) && | |||
| 23729 | !Subtarget.hasFP16() && (SVT == MVT::f64 || !Subtarget.hasF16C())) { | |||
| 23730 | if (!Subtarget.getTargetTriple().isOSDarwin()) | |||
| 23731 | return SDValue(); | |||
| 23732 | ||||
| 23733 | // We need a libcall but the ABI for f16 libcalls on MacOS is soft. | |||
| 23734 | TargetLowering::CallLoweringInfo CLI(DAG); | |||
| 23735 | Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); | |||
| 23736 | ||||
| 23737 | TargetLowering::ArgListTy Args; | |||
| 23738 | TargetLowering::ArgListEntry Entry; | |||
| 23739 | Entry.Node = In; | |||
| 23740 | Entry.Ty = EVT(SVT).getTypeForEVT(*DAG.getContext()); | |||
| 23741 | Entry.IsSExt = false; | |||
| 23742 | Entry.IsZExt = true; | |||
| 23743 | Args.push_back(Entry); | |||
| 23744 | ||||
| 23745 | SDValue Callee = DAG.getExternalSymbol( | |||
| 23746 | getLibcallName(SVT == MVT::f64 ? RTLIB::FPROUND_F64_F16 | |||
| 23747 | : RTLIB::FPROUND_F32_F16), | |||
| 23748 | getPointerTy(DAG.getDataLayout())); | |||
| 23749 | CLI.setDebugLoc(DL).setChain(Chain).setLibCallee( | |||
| 23750 | CallingConv::C, EVT(MVT::i16).getTypeForEVT(*DAG.getContext()), Callee, | |||
| 23751 | std::move(Args)); | |||
| 23752 | ||||
| 23753 | SDValue Res; | |||
| 23754 | std::tie(Res, Chain) = LowerCallTo(CLI); | |||
| 23755 | ||||
| 23756 | Res = DAG.getBitcast(MVT::f16, Res); | |||
| 23757 | ||||
| 23758 | if (IsStrict) | |||
| 23759 | Res = DAG.getMergeValues({Res, Chain}, DL); | |||
| 23760 | ||||
| 23761 | return Res; | |||
| 23762 | } | |||
| 23763 | ||||
| 23764 | if (VT.getScalarType() == MVT::f16 && !Subtarget.hasFP16()) { | |||
| 23765 | if (!Subtarget.hasF16C() || SVT.getScalarType() != MVT::f32) | |||
| 23766 | return SDValue(); | |||
| 23767 | ||||
| 23768 | if (VT.isVector()) | |||
| 23769 | return Op; | |||
| 23770 | ||||
| 23771 | SDValue Res; | |||
| 23772 | SDValue Rnd = DAG.getTargetConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, DL, | |||
| 23773 | MVT::i32); | |||
| 23774 | if (IsStrict) { | |||
| 23775 | Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4f32, | |||
| 23776 | DAG.getConstantFP(0, DL, MVT::v4f32), In, | |||
| 23777 | DAG.getIntPtrConstant(0, DL)); | |||
| 23778 | Res = DAG.getNode(X86ISD::STRICT_CVTPS2PH, DL, {MVT::v8i16, MVT::Other}, | |||
| 23779 | {Chain, Res, Rnd}); | |||
| 23780 | Chain = Res.getValue(1); | |||
| 23781 | } else { | |||
| 23782 | // FIXME: Should we use zeros for upper elements for non-strict? | |||
| 23783 | Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, In); | |||
| 23784 | Res = DAG.getNode(X86ISD::CVTPS2PH, DL, MVT::v8i16, Res, Rnd); | |||
| 23785 | } | |||
| 23786 | ||||
| 23787 | Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i16, Res, | |||
| 23788 | DAG.getIntPtrConstant(0, DL)); | |||
| 23789 | Res = DAG.getBitcast(MVT::f16, Res); | |||
| 23790 | ||||
| 23791 | if (IsStrict) | |||
| 23792 | return DAG.getMergeValues({Res, Chain}, DL); | |||
| 23793 | ||||
| 23794 | return Res; | |||
| 23795 | } | |||
| 23796 | ||||
| 23797 | return Op; | |||
| 23798 | } | |||
| 23799 | ||||
| 23800 | static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) { | |||
| 23801 | bool IsStrict = Op->isStrictFPOpcode(); | |||
| 23802 | SDValue Src = Op.getOperand(IsStrict ? 1 : 0); | |||
| 23803 | assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 && | |||
| 23804 | "Unexpected VT!"); | |||
| 23805 | ||||
| 23806 | SDLoc dl(Op); | |||
| 23807 | SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, | |||
| 23808 | DAG.getConstant(0, dl, MVT::v8i16), Src, | |||
| 23809 | DAG.getIntPtrConstant(0, dl)); | |||
| 23810 | ||||
| 23811 | SDValue Chain; | |||
| 23812 | if (IsStrict) { | |||
| 23813 | Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other}, | |||
| 23814 | {Op.getOperand(0), Res}); | |||
| 23815 | Chain = Res.getValue(1); | |||
| 23816 | } else { | |||
| 23817 | Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res); | |||
| 23818 | } | |||
| 23819 | ||||
| 23820 | Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, | |||
| 23821 | DAG.getIntPtrConstant(0, dl)); | |||
| 23822 | ||||
| 23823 | if (IsStrict) | |||
| 23824 | return DAG.getMergeValues({Res, Chain}, dl); | |||
| 23825 | ||||
| 23826 | return Res; | |||
| 23827 | } | |||
| 23828 | ||||
| 23829 | static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) { | |||
| 23830 | bool IsStrict = Op->isStrictFPOpcode(); | |||
| 23831 | SDValue Src = Op.getOperand(IsStrict ? 1 : 0); | |||
| 23832 | assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 && | |||
| 23833 | "Unexpected VT!"); | |||
| 23834 | ||||
| 23835 | SDLoc dl(Op); | |||
| 23836 | SDValue Res, Chain; | |||
| 23837 | if (IsStrict) { | |||
| 23838 | Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32, | |||
| 23839 | DAG.getConstantFP(0, dl, MVT::v4f32), Src, | |||
| 23840 | DAG.getIntPtrConstant(0, dl)); | |||
| 23841 | Res = DAG.getNode( | |||
| 23842 | X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other}, | |||
| 23843 | {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)}); | |||
| 23844 | Chain = Res.getValue(1); | |||
| 23845 | } else { | |||
| 23846 | // FIXME: Should we use zeros for upper elements for non-strict? | |||
| 23847 | Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src); | |||
| 23848 | Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res, | |||
| 23849 | DAG.getTargetConstant(4, dl, MVT::i32)); | |||
| 23850 | } | |||
| 23851 | ||||
| 23852 | Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res, | |||
| 23853 | DAG.getIntPtrConstant(0, dl)); | |||
| 23854 | ||||
| 23855 | if (IsStrict) | |||
| 23856 | return DAG.getMergeValues({Res, Chain}, dl); | |||
| 23857 | ||||
| 23858 | return Res; | |||
| 23859 | } | |||
| 23860 | ||||
| 23861 | SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op, | |||
| 23862 | SelectionDAG &DAG) const { | |||
| 23863 | SDLoc DL(Op); | |||
| 23864 | MakeLibCallOptions CallOptions; | |||
| 23865 | RTLIB::Libcall LC = | |||
| 23866 | RTLIB::getFPROUND(Op.getOperand(0).getValueType(), MVT::bf16); | |||
| 23867 | SDValue Res = | |||
| 23868 | makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first; | |||
| 23869 | return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, | |||
| 23870 | DAG.getBitcast(MVT::i32, Res)); | |||
| 23871 | } | |||
| 23872 | ||||
| 23873 | /// Depending on uarch and/or optimizing for size, we might prefer to use a | |||
| 23874 | /// vector operation in place of the typical scalar operation. | |||
| 23875 | static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG, | |||
| 23876 | const X86Subtarget &Subtarget) { | |||
| 23877 | // If both operands have other uses, this is probably not profitable. | |||
| 23878 | SDValue LHS = Op.getOperand(0); | |||
| 23879 | SDValue RHS = Op.getOperand(1); | |||
| 23880 | if (!LHS.hasOneUse() && !RHS.hasOneUse()) | |||
| 23881 | return Op; | |||
| 23882 | ||||
| 23883 | // FP horizontal add/sub were added with SSE3. Integer with SSSE3. | |||
| 23884 | bool IsFP = Op.getSimpleValueType().isFloatingPoint(); | |||
| 23885 | if (IsFP && !Subtarget.hasSSE3()) | |||
| 23886 | return Op; | |||
| 23887 | if (!IsFP && !Subtarget.hasSSSE3()) | |||
| 23888 | return Op; | |||
| 23889 | ||||
| 23890 | // Extract from a common vector. | |||
| 23891 | if (LHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || | |||
| 23892 | RHS.getOpcode() != ISD::EXTRACT_VECTOR_ELT || | |||
| 23893 | LHS.getOperand(0) != RHS.getOperand(0) || | |||
| 23894 | !isa<ConstantSDNode>(LHS.getOperand(1)) || | |||
| 23895 | !isa<ConstantSDNode>(RHS.getOperand(1)) || | |||
| 23896 | !shouldUseHorizontalOp(true, DAG, Subtarget)) | |||
| 23897 | return Op; | |||
| 23898 | ||||
| 23899 | // Allow commuted 'hadd' ops. | |||
| 23900 | // TODO: Allow commuted (f)sub by negating the result of (F)HSUB? | |||
| 23901 | unsigned HOpcode; | |||
| 23902 | switch (Op.getOpcode()) { | |||
| 23903 | case ISD::ADD: HOpcode = X86ISD::HADD; break; | |||
| 23904 | case ISD::SUB: HOpcode = X86ISD::HSUB; break; | |||
| 23905 | case ISD::FADD: HOpcode = X86ISD::FHADD; break; | |||
| 23906 | case ISD::FSUB: HOpcode = X86ISD::FHSUB; break; | |||
| 23907 | default: | |||
| 23908 | llvm_unreachable("Trying to lower unsupported opcode to horizontal op")::llvm::llvm_unreachable_internal("Trying to lower unsupported opcode to horizontal op" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 23908); | |||
| 23909 | } | |||
| 23910 | unsigned LExtIndex = LHS.getConstantOperandVal(1); | |||
| 23911 | unsigned RExtIndex = RHS.getConstantOperandVal(1); | |||
| 23912 | if ((LExtIndex & 1) == 1 && (RExtIndex & 1) == 0 && | |||
| 23913 | (HOpcode == X86ISD::HADD || HOpcode == X86ISD::FHADD)) | |||
| 23914 | std::swap(LExtIndex, RExtIndex); | |||
| 23915 | ||||
| 23916 | if ((LExtIndex & 1) != 0 || RExtIndex != (LExtIndex + 1)) | |||
| 23917 | return Op; | |||
| 23918 | ||||
| 23919 | SDValue X = LHS.getOperand(0); | |||
| 23920 | EVT VecVT = X.getValueType(); | |||
| 23921 | unsigned BitWidth = VecVT.getSizeInBits(); | |||
| 23922 | unsigned NumLanes = BitWidth / 128; | |||
| 23923 | unsigned NumEltsPerLane = VecVT.getVectorNumElements() / NumLanes; | |||
| 23924 | assert((BitWidth == 128 || BitWidth == 256 || BitWidth == 512) && | |||
| 23925 | "Not expecting illegal vector widths here"); | |||
| 23926 | ||||
| 23927 | // Creating a 256-bit horizontal op would be wasteful, and there is no 512-bit | |||
| 23928 | // equivalent, so extract the 256/512-bit source op to 128-bit if we can. | |||
| 23929 | SDLoc DL(Op); | |||
| 23930 | if (BitWidth == 256 || BitWidth == 512) { | |||
| 23931 | unsigned LaneIdx = LExtIndex / NumEltsPerLane; | |||
| 23932 | X = extract128BitVector(X, LaneIdx * NumEltsPerLane, DAG, DL); | |||
| 23933 | LExtIndex %= NumEltsPerLane; | |||
| 23934 | } | |||
| 23935 | ||||
| 23936 | // add (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hadd X, X), 0 | |||
| 23937 | // add (extractelt (X, 1), extractelt (X, 0)) --> extractelt (hadd X, X), 0 | |||
| 23938 | // add (extractelt (X, 2), extractelt (X, 3)) --> extractelt (hadd X, X), 1 | |||
| 23939 | // sub (extractelt (X, 0), extractelt (X, 1)) --> extractelt (hsub X, X), 0 | |||
| 23940 | SDValue HOp = DAG.getNode(HOpcode, DL, X.getValueType(), X, X); | |||
| 23941 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getSimpleValueType(), HOp, | |||
| 23942 | DAG.getIntPtrConstant(LExtIndex / 2, DL)); | |||
| 23943 | } | |||
| 23944 | ||||
| 23945 | /// Depending on uarch and/or optimizing for size, we might prefer to use a | |||
| 23946 | /// vector operation in place of the typical scalar operation. | |||
| 23947 | SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const { | |||
| 23948 | assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) && | |||
| 23949 | "Only expecting float/double"); | |||
| 23950 | return lowerAddSubToHorizontalOp(Op, DAG, Subtarget); | |||
| 23951 | } | |||
| 23952 | ||||
| 23953 | /// ISD::FROUND is defined to round to nearest with ties rounding away from 0. | |||
| 23954 | /// This mode isn't supported in hardware on X86. But as long as we aren't | |||
| 23955 | /// compiling with trapping math, we can emulate this with | |||
| 23956 | /// trunc(X + copysign(nextafter(0.5, 0.0), X)). | |||
| 23957 | static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) { | |||
| 23958 | SDValue N0 = Op.getOperand(0); | |||
| 23959 | SDLoc dl(Op); | |||
| 23960 | MVT VT = Op.getSimpleValueType(); | |||
| 23961 | ||||
| 23962 | // N0 += copysign(nextafter(0.5, 0.0), N0) | |||
| 23963 | const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT); | |||
| 23964 | bool Ignored; | |||
| 23965 | APFloat Point5Pred = APFloat(0.5f); | |||
| 23966 | Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored); | |||
| 23967 | Point5Pred.next(/*nextDown*/true); | |||
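| | // Worked example (f32): 2.5f + 0.49999997f rounds to 3.0f, which truncates | |||
| | // to 3.0f, while 2.4999998f + 0.49999997f stays below 3.0f and truncates | |||
| | // to 2.0f, giving round-half-away-from-zero without an explicit compare. | |||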
| 23968 | ||||
| 23969 | SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT, | |||
| 23970 | DAG.getConstantFP(Point5Pred, dl, VT), N0); | |||
| 23971 | N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder); | |||
| 23972 | ||||
| 23973 | // Truncate the result to remove fraction. | |||
| 23974 | return DAG.getNode(ISD::FTRUNC, dl, VT, N0); | |||
| 23975 | } | |||
| 23976 | ||||
| 23977 | /// The only differences between FABS and FNEG are the mask and the logic op. | |||
| 23978 | /// FNEG also has a folding opportunity for FNEG(FABS(x)). | |||
| 23979 | static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) { | |||
| 23980 | assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) && | |||
| 23981 | "Wrong opcode for lowering FABS or FNEG."); | |||
| 23982 | ||||
| 23983 | bool IsFABS = (Op.getOpcode() == ISD::FABS); | |||
| 23984 | ||||
| 23985 | // If this is a FABS and it has an FNEG user, bail out to fold the combination | |||
| 23986 | // into an FNABS. We'll lower the FABS after that if it is still in use. | |||
| 23987 | if (IsFABS) | |||
| 23988 | for (SDNode *User : Op->uses()) | |||
| 23989 | if (User->getOpcode() == ISD::FNEG) | |||
| 23990 | return Op; | |||
| 23991 | ||||
| 23992 | SDLoc dl(Op); | |||
| 23993 | MVT VT = Op.getSimpleValueType(); | |||
| 23994 | ||||
| 23995 | bool IsF128 = (VT == MVT::f128); | |||
| 23996 | assert(VT.isFloatingPoint() && VT != MVT::f80 && | |||
| 23997 | DAG.getTargetLoweringInfo().isTypeLegal(VT) && | |||
| 23998 | "Unexpected type in LowerFABSorFNEG"); | |||
| 23999 | ||||
| 24000 | // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to | |||
| 24001 | // decide if we should generate a 16-byte constant mask when we only need 4 or | |||
| 24002 | // 8 bytes for the scalar case. | |||
| 24003 | ||||
| 24004 | // There are no scalar bitwise logical SSE/AVX instructions, so we | |||
| 24005 | // generate a 16-byte vector constant and logic op even for the scalar case. | |||
| 24006 | // Using a 16-byte mask allows folding the load of the mask with | |||
| 24007 | // the logic op, so it can save (~4 bytes) on code size. | |||
| 24008 | bool IsFakeVector = !VT.isVector() && !IsF128; | |||
| 24009 | MVT LogicVT = VT; | |||
| 24010 | if (IsFakeVector) | |||
| 24011 | LogicVT = (VT == MVT::f64) ? MVT::v2f64 | |||
| 24012 | : (VT == MVT::f32) ? MVT::v4f32 | |||
| 24013 | : MVT::v8f16; | |||
| 24014 | ||||
| 24015 | unsigned EltBits = VT.getScalarSizeInBits(); | |||
| 24016 | // For FABS, mask is 0x7f...; for FNEG, mask is 0x80... | |||
| 24017 | APInt MaskElt = IsFABS ? APInt::getSignedMaxValue(EltBits) : | |||
| 24018 | APInt::getSignMask(EltBits); | |||
| 24019 | const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT); | |||
| 24020 | SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT); | |||
| 24021 | ||||
| 24022 | SDValue Op0 = Op.getOperand(0); | |||
| 24023 | bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS); | |||
| 24024 | unsigned LogicOp = IsFABS ? X86ISD::FAND : | |||
| 24025 | IsFNABS ? X86ISD::FOR : | |||
| 24026 | X86ISD::FXOR; | |||
| 24027 | SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0; | |||
| 24028 | ||||
| 24029 | if (VT.isVector() || IsF128) | |||
| 24030 | return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); | |||
| 24031 | ||||
| 24032 | // For the scalar case extend to a 128-bit vector, perform the logic op, | |||
| 24033 | // and extract the scalar result back out. | |||
| 24034 | Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand); | |||
| 24035 | SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask); | |||
| 24036 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode, | |||
| 24037 | DAG.getIntPtrConstant(0, dl)); | |||
| 24038 | } | |||
| 24039 | ||||
| 24040 | static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) { | |||
| 24041 | SDValue Mag = Op.getOperand(0); | |||
| 24042 | SDValue Sign = Op.getOperand(1); | |||
| 24043 | SDLoc dl(Op); | |||
| 24044 | ||||
| 24045 | // If the sign operand is smaller, extend it first. | |||
| 24046 | MVT VT = Op.getSimpleValueType(); | |||
| 24047 | if (Sign.getSimpleValueType().bitsLT(VT)) | |||
| 24048 | Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign); | |||
| 24049 | ||||
| 24050 | // And if it is bigger, shrink it first. | |||
| 24051 | if (Sign.getSimpleValueType().bitsGT(VT)) | |||
| 24052 | Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, | |||
| 24053 | DAG.getIntPtrConstant(0, dl, /*isTarget=*/true)); | |||
| 24054 | ||||
| 24055 | // At this point the operands and the result should have the same | |||
| 24056 | // type, and that won't be f80 since that is not custom lowered. | |||
| 24057 | bool IsF128 = (VT == MVT::f128); | |||
| 24058 | assert(VT.isFloatingPoint() && VT != MVT::f80 && | |||
| 24059 | DAG.getTargetLoweringInfo().isTypeLegal(VT) && | |||
| 24060 | "Unexpected type in LowerFCOPYSIGN"); | |||
| 24061 | ||||
| 24062 | const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT); | |||
| 24063 | ||||
| 24064 | // Perform all scalar logic operations as 16-byte vectors because there are no | |||
| 24065 | // scalar FP logic instructions in SSE. | |||
| 24066 | // TODO: This isn't necessary. If we used scalar types, we might avoid some | |||
| 24067 | // unnecessary splats, but we might miss load folding opportunities. Should | |||
| 24068 | // this decision be based on OptimizeForSize? | |||
| 24069 | bool IsFakeVector = !VT.isVector() && !IsF128; | |||
| 24070 | MVT LogicVT = VT; | |||
| 24071 | if (IsFakeVector) | |||
| 24072 | LogicVT = (VT == MVT::f64) ? MVT::v2f64 | |||
| 24073 | : (VT == MVT::f32) ? MVT::v4f32 | |||
| 24074 | : MVT::v8f16; | |||
| 24075 | ||||
| 24076 | // The mask constants are automatically splatted for vector types. | |||
| 24077 | unsigned EltSizeInBits = VT.getScalarSizeInBits(); | |||
| 24078 | SDValue SignMask = DAG.getConstantFP( | |||
| 24079 | APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT); | |||
| 24080 | SDValue MagMask = DAG.getConstantFP( | |||
| 24081 | APFloat(Sem, APInt::getSignedMaxValue(EltSizeInBits)), dl, LogicVT); | |||
| 24082 | ||||
| 24083 | // First, clear all bits but the sign bit from the second operand (sign). | |||
| 24084 | if (IsFakeVector) | |||
| 24085 | Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign); | |||
| 24086 | SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask); | |||
| 24087 | ||||
| 24088 | // Next, clear the sign bit from the first operand (magnitude). | |||
| 24089 | // TODO: If we had general constant folding for FP logic ops, this check | |||
| 24090 | // wouldn't be necessary. | |||
| 24091 | SDValue MagBits; | |||
| 24092 | if (ConstantFPSDNode *Op0CN = isConstOrConstSplatFP(Mag)) { | |||
| 24093 | APFloat APF = Op0CN->getValueAPF(); | |||
| 24094 | APF.clearSign(); | |||
| 24095 | MagBits = DAG.getConstantFP(APF, dl, LogicVT); | |||
| 24096 | } else { | |||
| 24097 | // If the magnitude operand wasn't a constant, we need to AND out the sign. | |||
| 24098 | if (IsFakeVector) | |||
| 24099 | Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag); | |||
| 24100 | MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask); | |||
| 24101 | } | |||
| 24102 | ||||
| 24103 | // OR the magnitude value with the sign bit. | |||
| 24104 | SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit); | |||
| 24105 | return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or, | |||
| 24106 | DAG.getIntPtrConstant(0, dl)); | |||
| 24107 | } | |||
| 24108 | ||||
| 24109 | static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) { | |||
| 24110 | SDValue N0 = Op.getOperand(0); | |||
| 24111 | SDLoc dl(Op); | |||
| 24112 | MVT VT = Op.getSimpleValueType(); | |||
| 24113 | ||||
| 24114 | MVT OpVT = N0.getSimpleValueType(); | |||
| 24115 | assert((OpVT == MVT::f32 || OpVT == MVT::f64) && | |||
| 24116 | "Unexpected type for FGETSIGN"); | |||
| 24117 | ||||
| 24118 | // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1). | |||
| 24119 | MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64); | |||
| 24120 | SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0); | |||
| 24121 | Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res); | |||
| 24122 | Res = DAG.getZExtOrTrunc(Res, dl, VT); | |||
| 24123 | Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT)); | |||
| 24124 | return Res; | |||
| 24125 | } | |||
| 24126 | ||||
| 24127 | /// Helper for attempting to create a X86ISD::BT node. | |||
| 24128 | static SDValue getBT(SDValue Src, SDValue BitNo, const SDLoc &DL, SelectionDAG &DAG) { | |||
| 24129 | // If Src is i8, promote it to i32 with any_extend. There is no i8 BT | |||
| 24130 | // instruction. Since the shift amount is in-range-or-undefined, we know | |||
| 24131 | // that doing a bittest on the i32 value is ok. We extend to i32 because | |||
| 24132 | // the encoding for the i16 version is larger than the i32 version. | |||
| 24133 | // Also promote i16 to i32 for performance / code size reasons. | |||
| 24134 | if (Src.getValueType().getScalarSizeInBits() < 32) | |||
| 24135 | Src = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src); | |||
| 24136 | ||||
| 24137 | // No legal type found, give up. | |||
| 24138 | if (!DAG.getTargetLoweringInfo().isTypeLegal(Src.getValueType())) | |||
| 24139 | return SDValue(); | |||
| 24140 | ||||
| 24141 | // See if we can use the 32-bit instruction instead of the 64-bit one for a | |||
| 24142 | // shorter encoding. Since the former takes the modulo 32 of BitNo and the | |||
| 24143 | // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is | |||
| 24144 | // known to be zero. | |||
| 24145 | if (Src.getValueType() == MVT::i64 && | |||
| 24146 | DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32))) | |||
| 24147 | Src = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src); | |||
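| | // Illustrative case: for (bt i64 %x, (and %i, 31)), bit 5 of the index is | |||
| | // known zero, so the i64 source can be truncated and the shorter 32-bit | |||
| | // BT encoding used without changing which bit is tested. | |||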
| 24148 | ||||
| 24149 | // If the operand types disagree, extend the shift amount to match. Since | |||
| 24150 | // BT ignores high bits (like shifts) we can use anyextend. | |||
| 24151 | if (Src.getValueType() != BitNo.getValueType()) { | |||
| 24152 | // Peek through a mask/modulo operation. | |||
| 24153 | // TODO: DAGCombine fails to do this as it just checks isTruncateFree, but | |||
| 24154 | // we probably need a better IsDesirableToPromoteOp to handle this as well. | |||
| 24155 | if (BitNo.getOpcode() == ISD::AND && BitNo->hasOneUse()) | |||
| 24156 | BitNo = DAG.getNode(ISD::AND, DL, Src.getValueType(), | |||
| 24157 | DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), | |||
| 24158 | BitNo.getOperand(0)), | |||
| 24159 | DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), | |||
| 24160 | BitNo.getOperand(1))); | |||
| 24161 | else | |||
| 24162 | BitNo = DAG.getNode(ISD::ANY_EXTEND, DL, Src.getValueType(), BitNo); | |||
| 24163 | } | |||
| 24164 | ||||
| 24165 | return DAG.getNode(X86ISD::BT, DL, MVT::i32, Src, BitNo); | |||
| 24166 | } | |||
| 24167 | ||||
| 24168 | /// Helper for creating a X86ISD::SETCC node. | |||
| 24169 | static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl, | |||
| 24170 | SelectionDAG &DAG) { | |||
| 24171 | return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, | |||
| 24172 | DAG.getTargetConstant(Cond, dl, MVT::i8), EFLAGS); | |||
| 24173 | } | |||
| 24174 | ||||
| 24175 | /// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a | |||
| 24176 | /// recognizable memcmp expansion. | |||
| 24177 | static bool isOrXorXorTree(SDValue X, bool Root = true) { | |||
| 24178 | if (X.getOpcode() == ISD::OR) | |||
| 24179 | return isOrXorXorTree(X.getOperand(0), false) && | |||
| 24180 | isOrXorXorTree(X.getOperand(1), false); | |||
| 24181 | if (Root) | |||
| 24182 | return false; | |||
| 24183 | return X.getOpcode() == ISD::XOR; | |||
| 24184 | } | |||
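| | // A typical tree accepted above, as produced by memcmp expansion of a | |||
| | // 32-byte compare (illustrative): | |||
| | // (or (xor (load %a0), (load %b0)), (xor (load %a1), (load %b1))) | |||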
| 24185 | ||||
| 24186 | /// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp | |||
| 24187 | /// expansion. | |||
| 24188 | template <typename F> | |||
| 24189 | static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, | |||
| 24190 | EVT VecVT, EVT CmpVT, bool HasPT, F SToV) { | |||
| 24191 | SDValue Op0 = X.getOperand(0); | |||
| 24192 | SDValue Op1 = X.getOperand(1); | |||
| 24193 | if (X.getOpcode() == ISD::OR) { | |||
| 24194 | SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV); | |||
| 24195 | SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV); | |||
| 24196 | if (VecVT != CmpVT) | |||
| 24197 | return DAG.getNode(ISD::OR, DL, CmpVT, A, B); | |||
| 24198 | if (HasPT) | |||
| 24199 | return DAG.getNode(ISD::OR, DL, VecVT, A, B); | |||
| 24200 | return DAG.getNode(ISD::AND, DL, CmpVT, A, B); | |||
| 24201 | } | |||
| 24202 | if (X.getOpcode() == ISD::XOR) { | |||
| 24203 | SDValue A = SToV(Op0); | |||
| 24204 | SDValue B = SToV(Op1); | |||
| 24205 | if (VecVT != CmpVT) | |||
| 24206 | return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE); | |||
| 24207 | if (HasPT) | |||
| 24208 | return DAG.getNode(ISD::XOR, DL, VecVT, A, B); | |||
| 24209 | return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ); | |||
| 24210 | } | |||
| 24211 | llvm_unreachable("Impossible")::llvm::llvm_unreachable_internal("Impossible", "llvm/lib/Target/X86/X86ISelLowering.cpp" , 24211); | |||
| 24212 | } | |||
| 24213 | ||||
| 24214 | /// Try to map a 128-bit or larger integer comparison to vector instructions | |||
| 24215 | /// before type legalization splits it up into chunks. | |||
| 24216 | static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, | |||
| 24217 | ISD::CondCode CC, | |||
| 24218 | const SDLoc &DL, | |||
| 24219 | SelectionDAG &DAG, | |||
| 24220 | const X86Subtarget &Subtarget) { | |||
| 24221 | assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate"); | |||
| 24222 | ||||
| 24223 | // We're looking for an oversized integer equality comparison. | |||
| 24224 | EVT OpVT = X.getValueType(); | |||
| 24225 | unsigned OpSize = OpVT.getSizeInBits(); | |||
| 24226 | if (!OpVT.isScalarInteger() || OpSize < 128) | |||
| 24227 | return SDValue(); | |||
| 24228 | ||||
| 24229 | // Ignore a comparison with zero because that gets special treatment in | |||
| 24230 | // EmitTest(). But make an exception for the special case of a pair of | |||
| 24231 | // logically-combined vector-sized operands compared to zero. This pattern may | |||
| 24232 | // be generated by the memcmp expansion pass with oversized integer compares | |||
| 24233 | // (see PR33325). | |||
| 24234 | bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X); | |||
| 24235 | if (isNullConstant(Y) && !IsOrXorXorTreeCCZero) | |||
| 24236 | return SDValue(); | |||
| 24237 | ||||
| 24238 | // Don't perform this combine if constructing the vector will be expensive. | |||
| 24239 | auto IsVectorBitCastCheap = [](SDValue X) { | |||
| 24240 | X = peekThroughBitcasts(X); | |||
| 24241 | return isa<ConstantSDNode>(X) || X.getValueType().isVector() || | |||
| 24242 | X.getOpcode() == ISD::LOAD; | |||
| 24243 | }; | |||
| 24244 | if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) && | |||
| 24245 | !IsOrXorXorTreeCCZero) | |||
| 24246 | return SDValue(); | |||
| 24247 | ||||
| 24248 | // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands. | |||
| 24249 | // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands. | |||
| 24250 | // Otherwise use PCMPEQ (plus AND) and mask testing. | |||
| 24251 | bool NoImplicitFloatOps = | |||
| 24252 | DAG.getMachineFunction().getFunction().hasFnAttribute( | |||
| 24253 | Attribute::NoImplicitFloat); | |||
| 24254 | if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps && | |||
| 24255 | ((OpSize == 128 && Subtarget.hasSSE2()) || | |||
| 24256 | (OpSize == 256 && Subtarget.hasAVX()) || | |||
| 24257 | (OpSize == 512 && Subtarget.useAVX512Regs()))) { | |||
| 24258 | bool HasPT = Subtarget.hasSSE41(); | |||
| 24259 | ||||
| 24260 | // PTEST and MOVMSK are slow on Knights Landing and Knights Mill and widened | |||
| 24261 | // vector registers are essentially free. (Technically, widening registers | |||
| 24262 | // prevents load folding, but the tradeoff is worth it.) | |||
| 24263 | bool PreferKOT = Subtarget.preferMaskRegisters(); | |||
| 24264 | bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512; | |||
| 24265 | ||||
| 24266 | EVT VecVT = MVT::v16i8; | |||
| 24267 | EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT; | |||
| 24268 | if (OpSize == 256) { | |||
| 24269 | VecVT = MVT::v32i8; | |||
| 24270 | CmpVT = PreferKOT ? MVT::v32i1 : VecVT; | |||
| 24271 | } | |||
| 24272 | EVT CastVT = VecVT; | |||
| 24273 | bool NeedsAVX512FCast = false; | |||
| 24274 | if (OpSize == 512 || NeedZExt) { | |||
| 24275 | if (Subtarget.hasBWI()) { | |||
| 24276 | VecVT = MVT::v64i8; | |||
| 24277 | CmpVT = MVT::v64i1; | |||
| 24278 | if (OpSize == 512) | |||
| 24279 | CastVT = VecVT; | |||
| 24280 | } else { | |||
| 24281 | VecVT = MVT::v16i32; | |||
| 24282 | CmpVT = MVT::v16i1; | |||
| 24283 | CastVT = OpSize == 512 ? VecVT | |||
| 24284 | : OpSize == 256 ? MVT::v8i32 | |||
| 24285 | : MVT::v4i32; | |||
| 24286 | NeedsAVX512FCast = true; | |||
| 24287 | } | |||
| 24288 | } | |||
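| | // Net effect of the choices above: with AVX512BW the equality compare is | |||
| | // done as v64i8 -> v64i1 (tested via KORTESTQ below); with plain AVX512F | |||
| | // it is v16i32 -> v16i1 (KORTESTW), with narrower inputs zero-padded into | |||
| | // the 512-bit register. | |||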
| 24289 | ||||
| 24290 | auto ScalarToVector = [&](SDValue X) -> SDValue { | |||
| 24291 | bool TmpZext = false; | |||
| 24292 | EVT TmpCastVT = CastVT; | |||
| 24293 | if (X.getOpcode() == ISD::ZERO_EXTEND) { | |||
| 24294 | SDValue OrigX = X.getOperand(0); | |||
| 24295 | unsigned OrigSize = OrigX.getScalarValueSizeInBits(); | |||
| 24296 | if (OrigSize < OpSize) { | |||
| 24297 | if (OrigSize == 128) { | |||
| 24298 | TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8; | |||
| 24299 | X = OrigX; | |||
| 24300 | TmpZext = true; | |||
| 24301 | } else if (OrigSize == 256) { | |||
| 24302 | TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8; | |||
| 24303 | X = OrigX; | |||
| 24304 | TmpZext = true; | |||
| 24305 | } | |||
| 24306 | } | |||
| 24307 | } | |||
| 24308 | X = DAG.getBitcast(TmpCastVT, X); | |||
| 24309 | if (!NeedZExt && !TmpZext) | |||
| 24310 | return X; | |||
| 24311 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT, | |||
| 24312 | DAG.getConstant(0, DL, VecVT), X, | |||
| 24313 | DAG.getVectorIdxConstant(0, DL)); | |||
| 24314 | }; | |||
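| | // The lambda above bitcasts each scalar operand to the compare vector | |||
| | // type, looking through zero-extends so that a zext'd 128/256-bit value | |||
| | // can be zero-padded into the wider register with a subvector insert. | |||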
| 24315 | ||||
| 24316 | SDValue Cmp; | |||
| 24317 | if (IsOrXorXorTreeCCZero) { | |||
| 24318 | // This is a bitwise-combined equality comparison of 2 pairs of vectors: | |||
| 24319 | // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne | |||
| 24320 | // Use 2 vector equality compares and 'and' the results before doing a | |||
| 24321 | // MOVMSK. | |||
| 24322 | Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector); | |||
| 24323 | } else { | |||
| 24324 | SDValue VecX = ScalarToVector(X); | |||
| 24325 | SDValue VecY = ScalarToVector(Y); | |||
| 24326 | if (VecVT != CmpVT) { | |||
| 24327 | Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE); | |||
| 24328 | } else if (HasPT) { | |||
| 24329 | Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY); | |||
| 24330 | } else { | |||
| 24331 | Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ); | |||
| 24332 | } | |||
| 24333 | } | |||
| 24334 | // AVX512 should emit a setcc that will lower to kortest. | |||
| 24335 | if (VecVT != CmpVT) { | |||
| 24336 | EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 | |||
| 24337 | : CmpVT == MVT::v32i1 ? MVT::i32 | |||
| 24338 | : MVT::i16; | |||
| 24339 | return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp), | |||
| 24340 | DAG.getConstant(0, DL, KRegVT), CC); | |||
| 24341 | } | |||
| 24342 | if (HasPT) { | |||
| 24343 | SDValue BCCmp = | |||
| 24344 | DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64, Cmp); | |||
| 24345 | SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp); | |||
| 24346 | X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE; | |||
| 24347 | SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG); | |||
| 24348 | return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0)); | |||
| 24349 | } | |||
| 24350 | // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality. | |||
| 24351 | // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq | |||
| 24352 | // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne | |||
| 24353 | assert(Cmp.getValueType() == MVT::v16i8 && | |||
| 24354 | "Non 128-bit vector on pre-SSE41 target"); | |||
| 24355 | SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp); | |||
| 24356 | SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32); | |||
| 24357 | return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC); | |||
| 24358 | } | |||
| 24359 | ||||
| 24360 | return SDValue(); | |||
| 24361 | } | |||
| 24362 | ||||
| 24363 | /// Helper for matching BINOP(EXTRACTELT(X,0),BINOP(EXTRACTELT(X,1),...)) | |||
| 24364 | /// style scalarized (associative) reduction patterns. Partial reductions | |||
| 24365 | /// are supported when the pointer SrcMask is non-null. | |||
| 24366 | /// TODO - move this to SelectionDAG? | |||
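| | /// Example of a pattern this matches (illustrative): | |||
| | /// or (extractelt X, 0), (or (extractelt X, 1), | |||
| | /// (or (extractelt X, 2), (extractelt X, 3))) | |||
| | /// with every lane of X used exactly once when SrcMask is null. | |||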
| 24367 | static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp, | |||
| 24368 | SmallVectorImpl<SDValue> &SrcOps, | |||
| 24369 | SmallVectorImpl<APInt> *SrcMask = nullptr) { | |||
| 24370 | SmallVector<SDValue, 8> Opnds; | |||
| 24371 | DenseMap<SDValue, APInt> SrcOpMap; | |||
| 24372 | EVT VT = MVT::Other; | |||
| 24373 | ||||
| 24374 | // Recognize a special case where a vector is cast into a wide integer to | |||
| 24375 | // test all 0s. | |||
| 24376 | assert(Op.getOpcode() == unsigned(BinOp) && | |||
| 24377 | "Unexpected bit reduction opcode"); | |||
| 24378 | Opnds.push_back(Op.getOperand(0)); | |||
| 24379 | Opnds.push_back(Op.getOperand(1)); | |||
| 24380 | ||||
| 24381 | for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) { | |||
| 24382 | SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot; | |||
| 24383 | // BFS traverse all BinOp operands. | |||
| 24384 | if (I->getOpcode() == unsigned(BinOp)) { | |||
| 24385 | Opnds.push_back(I->getOperand(0)); | |||
| 24386 | Opnds.push_back(I->getOperand(1)); | |||
| 24387 | // Re-evaluate the number of nodes to be traversed. | |||
| 24388 | e += 2; // 2 more nodes (LHS and RHS) are pushed. | |||
| 24389 | continue; | |||
| 24390 | } | |||
| 24391 | ||||
| 24392 | // Quit if the operand is not an EXTRACT_VECTOR_ELT. | |||
| 24393 | if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT) | |||
| 24394 | return false; | |||
| 24395 | ||||
| 24396 | // Quit if the extract index is not a constant. | |||
| 24397 | auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1)); | |||
| 24398 | if (!Idx) | |||
| 24399 | return false; | |||
| 24400 | ||||
| 24401 | SDValue Src = I->getOperand(0); | |||
| 24402 | DenseMap<SDValue, APInt>::iterator M = SrcOpMap.find(Src); | |||
| 24403 | if (M == SrcOpMap.end()) { | |||
| 24404 | VT = Src.getValueType(); | |||
| 24405 | // Quit if not the same type. | |||
| 24406 | if (!SrcOpMap.empty() && VT != SrcOpMap.begin()->first.getValueType()) | |||
| 24407 | return false; | |||
| 24408 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 24409 | APInt EltCount = APInt::getZero(NumElts); | |||
| 24410 | M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first; | |||
| 24411 | SrcOps.push_back(Src); | |||
| 24412 | } | |||
| 24413 | ||||
| 24414 | // Quit if element already used. | |||
| 24415 | unsigned CIdx = Idx->getZExtValue(); | |||
| 24416 | if (M->second[CIdx]) | |||
| 24417 | return false; | |||
| 24418 | M->second.setBit(CIdx); | |||
| 24419 | } | |||
| 24420 | ||||
| 24421 | if (SrcMask) { | |||
| 24422 | // Collect the source partial masks. | |||
| 24423 | for (SDValue &SrcOp : SrcOps) | |||
| 24424 | SrcMask->push_back(SrcOpMap[SrcOp]); | |||
| 24425 | } else { | |||
| 24426 | // Quit if not all elements are used. | |||
| 24427 | for (const auto &I : SrcOpMap) | |||
| 24428 | if (!I.second.isAllOnes()) | |||
| 24429 | return false; | |||
| 24430 | } | |||
| 24431 | ||||
| 24432 | return true; | |||
| 24433 | } | |||
| 24434 | ||||
| 24435 | // Helper function for comparing all bits of two vectors. | |||
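| | // An illustrative outline of the lowering strategy (the exact sequence | |||
| | // depends on the subtarget): AVX512 uses KORTEST on a vXi1 compare, | |||
| | // SSE4.1+ uses PTEST(XOR(LHS,RHS)), and the SSE2 fallback uses | |||
| | // CMP(MOVMSK(NOT(PCMPEQ(LHS,RHS))), 0); oversized vectors are first | |||
| | // split down to the widest supported test size. | |||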
| 24436 | static SDValue LowerVectorAllEqual(const SDLoc &DL, SDValue LHS, SDValue RHS, | |||
| 24437 | ISD::CondCode CC, const APInt &OriginalMask, | |||
| 24438 | const X86Subtarget &Subtarget, | |||
| 24439 | SelectionDAG &DAG, X86::CondCode &X86CC) { | |||
| 24440 | EVT VT = LHS.getValueType(); | |||
| 24441 | unsigned ScalarSize = VT.getScalarSizeInBits(); | |||
| 24442 | if (OriginalMask.getBitWidth() != ScalarSize) { | |||
| 24443 | assert(ScalarSize == 1 && "Element Mask vs Vector bitwidth mismatch"); | |||
| 24444 | return SDValue(); | |||
| 24445 | } | |||
| 24446 | ||||
| 24447 | // Quit if not convertible to a legal scalar or 128/256-bit vector. | |||
| 24448 | if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits())) | |||
| 24449 | return SDValue(); | |||
| 24450 | ||||
| 24451 | // FCMP may use ISD::SETNE when nnan - early out if we manage to get here. | |||
| 24452 | if (VT.isFloatingPoint()) | |||
| 24453 | return SDValue(); | |||
| 24454 | ||||
| 24455 | assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode"); | |||
| 24456 | X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE); | |||
| 24457 | ||||
| 24458 | APInt Mask = OriginalMask; | |||
| 24459 | ||||
| 24460 | auto MaskBits = [&](SDValue Src) { | |||
| 24461 | if (Mask.isAllOnes()) | |||
| 24462 | return Src; | |||
| 24463 | EVT SrcVT = Src.getValueType(); | |||
| 24464 | SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT); | |||
| 24465 | return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue); | |||
| 24466 | }; | |||
| 24467 | ||||
| 24468 | // For sub-128-bit vector, cast to (legal) integer and compare with zero. | |||
| 24469 | if (VT.getSizeInBits() < 128) { | |||
| 24470 | EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); | |||
| 24471 | if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT)) { | |||
| 24472 | if (IntVT != MVT::i64) | |||
| 24473 | return SDValue(); | |||
| 24474 | auto SplitLHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(LHS)), DL, | |||
| 24475 | MVT::i32, MVT::i32); | |||
| 24476 | auto SplitRHS = DAG.SplitScalar(DAG.getBitcast(IntVT, MaskBits(RHS)), DL, | |||
| 24477 | MVT::i32, MVT::i32); | |||
| 24478 | SDValue Lo = | |||
| 24479 | DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.first, SplitRHS.first); | |||
| 24480 | SDValue Hi = | |||
| 24481 | DAG.getNode(ISD::XOR, DL, MVT::i32, SplitLHS.second, SplitRHS.second); | |||
| 24482 | return DAG.getNode(X86ISD::CMP, DL, MVT::i32, | |||
| 24483 | DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi), | |||
| 24484 | DAG.getConstant(0, DL, MVT::i32)); | |||
| 24485 | } | |||
| 24486 | return DAG.getNode(X86ISD::CMP, DL, MVT::i32, | |||
| 24487 | DAG.getBitcast(IntVT, MaskBits(LHS)), | |||
| 24488 | DAG.getBitcast(IntVT, MaskBits(RHS))); | |||
| 24489 | } | |||
| 24490 | ||||
| 24491 | // Without PTEST, a masked v2i64 or-reduction is not faster than | |||
| 24492 | // scalarization. | |||
| 24493 | bool UseKORTEST = Subtarget.useAVX512Regs(); | |||
| 24494 | bool UsePTEST = Subtarget.hasSSE41(); | |||
| 24495 | if (!UsePTEST && !Mask.isAllOnes() && ScalarSize > 32) | |||
| 24496 | return SDValue(); | |||
| 24497 | ||||
| 24498 | // Split down to 128/256/512-bit vector. | |||
| 24499 | unsigned TestSize = UseKORTEST ? 512 : (Subtarget.hasAVX() ? 256 : 128); | |||
| 24500 | ||||
| 24501 | // If the input vector has vector elements wider than the target test size, | |||
| 24502 | // then cast to <X x i64> so it will safely split. | |||
| 24503 | if (ScalarSize > TestSize) { | |||
| 24504 | if (!Mask.isAllOnes()) | |||
| 24505 | return SDValue(); | |||
| 24506 | VT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, VT.getSizeInBits() / 64); | |||
| 24507 | LHS = DAG.getBitcast(VT, LHS); | |||
| 24508 | RHS = DAG.getBitcast(VT, RHS); | |||
| 24509 | Mask = APInt::getAllOnes(64); | |||
| 24510 | } | |||
| 24511 | ||||
| 24512 | if (VT.getSizeInBits() > TestSize) { | |||
| 24513 | KnownBits KnownRHS = DAG.computeKnownBits(RHS); | |||
| 24514 | if (KnownRHS.isConstant() && KnownRHS.getConstant() == Mask) { | |||
| 24515 | // If ICMP(AND(LHS,MASK),MASK) - reduce using AND splits. | |||
| 24516 | while (VT.getSizeInBits() > TestSize) { | |||
| 24517 | auto Split = DAG.SplitVector(LHS, DL); | |||
| 24518 | VT = Split.first.getValueType(); | |||
| 24519 | LHS = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second); | |||
| 24520 | } | |||
| 24521 | RHS = DAG.getAllOnesConstant(DL, VT); | |||
| 24522 | } else if (!UsePTEST && !KnownRHS.isZero()) { | |||
| 24523 | // MOVMSK Special Case: | |||
| 24524 | // ALLOF(CMPEQ(X,Y)) -> AND(CMPEQ(X[0],Y[0]),CMPEQ(X[1],Y[1]),....) | |||
| 24525 | MVT SVT = ScalarSize >= 32 ? MVT::i32 : MVT::i8; | |||
| 24526 | VT = MVT::getVectorVT(SVT, VT.getSizeInBits() / SVT.getSizeInBits()); | |||
| 24527 | LHS = DAG.getBitcast(VT, MaskBits(LHS)); | |||
| 24528 | RHS = DAG.getBitcast(VT, MaskBits(RHS)); | |||
| 24529 | EVT BoolVT = VT.changeVectorElementType(MVT::i1); | |||
| 24530 | SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETEQ); | |||
| 24531 | V = DAG.getSExtOrTrunc(V, DL, VT); | |||
| 24532 | while (VT.getSizeInBits() > TestSize) { | |||
| 24533 | auto Split = DAG.SplitVector(V, DL); | |||
| 24534 | VT = Split.first.getValueType(); | |||
| 24535 | V = DAG.getNode(ISD::AND, DL, VT, Split.first, Split.second); | |||
| 24536 | } | |||
| 24537 | V = DAG.getNOT(DL, V, VT); | |||
| 24538 | V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); | |||
| 24539 | return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V, | |||
| 24540 | DAG.getConstant(0, DL, MVT::i32)); | |||
| 24541 | } else { | |||
| 24542 | // Convert to a ICMP_EQ(XOR(LHS,RHS),0) pattern. | |||
| 24543 | SDValue V = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS); | |||
| 24544 | while (VT.getSizeInBits() > TestSize) { | |||
| 24545 | auto Split = DAG.SplitVector(V, DL); | |||
| 24546 | VT = Split.first.getValueType(); | |||
| 24547 | V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second); | |||
| 24548 | } | |||
| 24549 | LHS = V; | |||
| 24550 | RHS = DAG.getConstant(0, DL, VT); | |||
| 24551 | } | |||
| 24552 | } | |||
| 24553 | ||||
| 24554 | if (UseKORTEST && VT.is512BitVector()) { | |||
| 24555 | MVT TestVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32); | |||
| 24556 | MVT BoolVT = TestVT.changeVectorElementType(MVT::i1); | |||
| 24557 | LHS = DAG.getBitcast(TestVT, MaskBits(LHS)); | |||
| 24558 | RHS = DAG.getBitcast(TestVT, MaskBits(RHS)); | |||
| 24559 | SDValue V = DAG.getSetCC(DL, BoolVT, LHS, RHS, ISD::SETNE); | |||
| 24560 | return DAG.getNode(X86ISD::KORTEST, DL, MVT::i32, V, V); | |||
| 24561 | } | |||
| 24562 | ||||
| 24563 | if (UsePTEST) { | |||
| 24564 | MVT TestVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64); | |||
| 24565 | LHS = DAG.getBitcast(TestVT, MaskBits(LHS)); | |||
| 24566 | RHS = DAG.getBitcast(TestVT, MaskBits(RHS)); | |||
| 24567 | SDValue V = DAG.getNode(ISD::XOR, DL, TestVT, LHS, RHS); | |||
| 24568 | return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V); | |||
| 24569 | } | |||
| 24570 | ||||
| 24571 | assert(VT.getSizeInBits() == 128 && "Failure to split to 128-bits"); | |||
| 24572 | MVT MaskVT = ScalarSize >= 32 ? MVT::v4i32 : MVT::v16i8; | |||
| 24573 | LHS = DAG.getBitcast(MaskVT, MaskBits(LHS)); | |||
| 24574 | RHS = DAG.getBitcast(MaskVT, MaskBits(RHS)); | |||
| 24575 | SDValue V = DAG.getNode(X86ISD::PCMPEQ, DL, MaskVT, LHS, RHS); | |||
| 24576 | V = DAG.getNOT(DL, V, MaskVT); | |||
| 24577 | V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); | |||
| 24578 | return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V, | |||
| 24579 | DAG.getConstant(0, DL, MVT::i32)); | |||
| 24580 | } | |||
| 24581 | ||||
| 24582 | // Check whether an AND/OR'd reduction tree is PTEST-able, or if we can fall | |||
| 24583 | // back to CMP(MOVMSK(PCMPEQB(X,Y))). | |||
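| | // Example inputs (anyof/allof reductions over a vector X): | |||
| | //   icmp eq (or (extractelt X, 0), (extractelt X, 1)), 0 | |||
| | //   icmp eq (and (extractelt X, 0), (extractelt X, 1)), -1 | |||
| | //   icmp eq/ne (vecreduce_or X), 0 | |||
| | //   icmp eq/ne (bitcast (vXi1 setcc X, Y) to iX), 0 or -1 | |||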
| 24584 | static SDValue MatchVectorAllEqualTest(SDValue LHS, SDValue RHS, | |||
| 24585 | ISD::CondCode CC, const SDLoc &DL, | |||
| 24586 | const X86Subtarget &Subtarget, | |||
| 24587 | SelectionDAG &DAG, | |||
| 24588 | X86::CondCode &X86CC) { | |||
| 24589 | assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode"); | |||
| 24590 | ||||
| 24591 | bool CmpNull = isNullConstant(RHS); | |||
| 24592 | bool CmpAllOnes = isAllOnesConstant(RHS); | |||
| 24593 | if (!CmpNull && !CmpAllOnes) | |||
| 24594 | return SDValue(); | |||
| 24595 | ||||
| 24596 | SDValue Op = LHS; | |||
| 24597 | if (!Subtarget.hasSSE2() || !Op->hasOneUse()) | |||
| 24598 | return SDValue(); | |||
| 24599 | ||||
| 24600 | // Check whether we're masking/truncating an OR-reduction result, in which | |||
| 24601 | // case track the masked bits. | |||
| 24602 | // TODO: Add CmpAllOnes support. | |||
| 24603 | APInt Mask = APInt::getAllOnes(Op.getScalarValueSizeInBits()); | |||
| 24604 | if (CmpNull) { | |||
| 24605 | switch (Op.getOpcode()) { | |||
| 24606 | case ISD::TRUNCATE: { | |||
| 24607 | SDValue Src = Op.getOperand(0); | |||
| 24608 | Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(), | |||
| 24609 | Op.getScalarValueSizeInBits()); | |||
| 24610 | Op = Src; | |||
| 24611 | break; | |||
| 24612 | } | |||
| 24613 | case ISD::AND: { | |||
| 24614 | if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { | |||
| 24615 | Mask = Cst->getAPIntValue(); | |||
| 24616 | Op = Op.getOperand(0); | |||
| 24617 | } | |||
| 24618 | break; | |||
| 24619 | } | |||
| 24620 | } | |||
| 24621 | } | |||
| 24622 | ||||
| 24623 | ISD::NodeType LogicOp = CmpNull ? ISD::OR : ISD::AND; | |||
| 24624 | ||||
| 24625 | // Match icmp(or(extract(X,0),extract(X,1)),0) anyof reduction patterns. | |||
| 24626 | // Match icmp(and(extract(X,0),extract(X,1)),-1) allof reduction patterns. | |||
| 24627 | SmallVector<SDValue, 8> VecIns; | |||
| 24628 | if (Op.getOpcode() == LogicOp && matchScalarReduction(Op, LogicOp, VecIns)) { | |||
| 24629 | EVT VT = VecIns[0].getValueType(); | |||
| 24630 | assert(llvm::all_of(VecIns, | |||
| 24631 | [VT](SDValue V) { return VT == V.getValueType(); }) && | |||
| 24632 | "Reduction source vector mismatch"); | |||
| 24633 | ||||
| 24634 | // Quit if not splittable to scalar/128/256/512-bit vector. | |||
| 24635 | if (!llvm::has_single_bit<uint32_t>(VT.getSizeInBits())) | |||
| 24636 | return SDValue(); | |||
| 24637 | ||||
| 24638 | // If more than one full vector is evaluated, AND/OR them first before | |||
| 24639 | // PTEST. | |||
| 24640 | for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; | |||
| 24641 | Slot += 2, e += 1) { | |||
| 24642 | // Each iteration will AND/OR 2 nodes and append the result until there is | |||
| 24643 | // only 1 node left, i.e. the final value of all vectors. | |||
| 24644 | SDValue LHS = VecIns[Slot]; | |||
| 24645 | SDValue RHS = VecIns[Slot + 1]; | |||
| 24646 | VecIns.push_back(DAG.getNode(LogicOp, DL, VT, LHS, RHS)); | |||
| 24647 | } | |||
| 24648 | ||||
| 24649 | return LowerVectorAllEqual(DL, VecIns.back(), | |||
| 24650 | CmpNull ? DAG.getConstant(0, DL, VT) | |||
| 24651 | : DAG.getAllOnesConstant(DL, VT), | |||
| 24652 | CC, Mask, Subtarget, DAG, X86CC); | |||
| 24653 | } | |||
| 24654 | ||||
| 24655 | // Match icmp(reduce_or(X),0) anyof reduction patterns. | |||
| 24656 | // Match icmp(reduce_and(X),-1) allof reduction patterns. | |||
| 24657 | if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) { | |||
| 24658 | ISD::NodeType BinOp; | |||
| 24659 | if (SDValue Match = | |||
| 24660 | DAG.matchBinOpReduction(Op.getNode(), BinOp, {LogicOp})) { | |||
| 24661 | EVT MatchVT = Match.getValueType(); | |||
| 24662 | return LowerVectorAllEqual(DL, Match, | |||
| 24663 | CmpNull ? DAG.getConstant(0, DL, MatchVT) | |||
| 24664 | : DAG.getAllOnesConstant(DL, MatchVT), | |||
| 24665 | CC, Mask, Subtarget, DAG, X86CC); | |||
| 24666 | } | |||
| 24667 | } | |||
| 24668 | ||||
| 24669 | if (Mask.isAllOnes()) { | |||
| 24670 | assert(!Op.getValueType().isVector() && | |||
| 24671 | "Illegal vector type for reduction pattern"); | |||
| 24672 | SDValue Src = peekThroughBitcasts(Op); | |||
| 24673 | if (Src.getValueType().isFixedLengthVector() && | |||
| 24674 | Src.getValueType().getScalarType() == MVT::i1) { | |||
| 24675 | // Match icmp(bitcast(icmp_ne(X,Y)),0) reduction patterns. | |||
| 24676 | // Match icmp(bitcast(icmp_eq(X,Y)),-1) reduction patterns. | |||
| 24677 | if (Src.getOpcode() == ISD::SETCC) { | |||
| 24678 | SDValue LHS = Src.getOperand(0); | |||
| 24679 | SDValue RHS = Src.getOperand(1); | |||
| 24680 | EVT LHSVT = LHS.getValueType(); | |||
| 24681 | ISD::CondCode SrcCC = cast<CondCodeSDNode>(Src.getOperand(2))->get(); | |||
| 24682 | if (SrcCC == (CmpNull ? ISD::SETNE : ISD::SETEQ) && | |||
| 24683 | llvm::has_single_bit<uint32_t>(LHSVT.getSizeInBits())) { | |||
| 24684 | APInt SrcMask = APInt::getAllOnes(LHSVT.getScalarSizeInBits()); | |||
| 24685 | return LowerVectorAllEqual(DL, LHS, RHS, CC, SrcMask, Subtarget, DAG, | |||
| 24686 | X86CC); | |||
| 24687 | } | |||
| 24688 | } | |||
| 24689 | // Match icmp(bitcast(vXi1 trunc(Y)),0) reduction patterns. | |||
| 24690 | // Match icmp(bitcast(vXi1 trunc(Y)),-1) reduction patterns. | |||
| 24691 | // Peek through truncation, mask the LSB and compare against zero/LSB. | |||
| 24692 | if (Src.getOpcode() == ISD::TRUNCATE) { | |||
| 24693 | SDValue Inner = Src.getOperand(0); | |||
| 24694 | EVT InnerVT = Inner.getValueType(); | |||
| 24695 | if (llvm::has_single_bit<uint32_t>(InnerVT.getSizeInBits())) { | |||
| 24696 | unsigned BW = InnerVT.getScalarSizeInBits(); | |||
| 24697 | APInt SrcMask = APInt(BW, 1); | |||
| 24698 | APInt Cmp = CmpNull ? APInt::getZero(BW) : SrcMask; | |||
| 24699 | return LowerVectorAllEqual(DL, Inner, | |||
| 24700 | DAG.getConstant(Cmp, DL, InnerVT), CC, | |||
| 24701 | SrcMask, Subtarget, DAG, X86CC); | |||
| 24702 | } | |||
| 24703 | } | |||
| 24704 | } | |||
| 24705 | } | |||
| 24706 | ||||
| 24707 | return SDValue(); | |||
| 24708 | } | |||
| 24709 | ||||
| 24710 | /// Return true if \c Op has a use that doesn't just read flags. | |||
| 24711 | static bool hasNonFlagsUse(SDValue Op) { | |||
| 24712 | for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE; | |||
| 24713 | ++UI) { | |||
| 24714 | SDNode *User = *UI; | |||
| 24715 | unsigned UOpNo = UI.getOperandNo(); | |||
| 24716 | if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) { | |||
| 24717 | // Look past the truncate. | |||
| 24718 | UOpNo = User->use_begin().getOperandNo(); | |||
| 24719 | User = *User->use_begin(); | |||
| 24720 | } | |||
| 24721 | ||||
| 24722 | if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC && | |||
| 24723 | !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) | |||
| 24724 | return true; | |||
| 24725 | } | |||
| 24726 | return false; | |||
| 24727 | } | |||
| 24728 | ||||
| 24729 | // Transform to an x86-specific ALU node with flags if there is a chance of | |||
| 24730 | // using an RMW op or only the flags are used. Otherwise, leave | |||
| 24731 | // the node alone and emit a 'cmp' or 'test' instruction. | |||
| 24732 | static bool isProfitableToUseFlagOp(SDValue Op) { | |||
| 24733 | for (SDNode *U : Op->uses()) | |||
| 24734 | if (U->getOpcode() != ISD::CopyToReg && | |||
| 24735 | U->getOpcode() != ISD::SETCC && | |||
| 24736 | U->getOpcode() != ISD::STORE) | |||
| 24737 | return false; | |||
| 24738 | ||||
| 24739 | return true; | |||
| 24740 | } | |||
| 24741 | ||||
| 24742 | /// Emit nodes that will be selected as "test Op0,Op0", or something | |||
| 24743 | /// equivalent. | |||
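| | /// For instance, (seteq (add X, Y), 0) can reuse the ZF produced by an | |||
| | /// EFLAGS-setting X86ISD::ADD instead of emitting the ADD followed by a | |||
| | /// separate TEST. | |||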
| 24744 | static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, | |||
| 24745 | SelectionDAG &DAG, const X86Subtarget &Subtarget) { | |||
| 24746 | // CF and OF aren't always set the way we want. Determine which | |||
| 24747 | // of these we need. | |||
| 24748 | bool NeedCF = false; | |||
| 24749 | bool NeedOF = false; | |||
| 24750 | switch (X86CC) { | |||
| 24751 | default: break; | |||
| 24752 | case X86::COND_A: case X86::COND_AE: | |||
| 24753 | case X86::COND_B: case X86::COND_BE: | |||
| 24754 | NeedCF = true; | |||
| 24755 | break; | |||
| 24756 | case X86::COND_G: case X86::COND_GE: | |||
| 24757 | case X86::COND_L: case X86::COND_LE: | |||
| 24758 | case X86::COND_O: case X86::COND_NO: { | |||
| 24759 | // Check if we really need to set the | |||
| 24760 | // Overflow flag. If NoSignedWrap is present | |||
| 24761 | // that is not actually needed. | |||
| 24762 | switch (Op->getOpcode()) { | |||
| 24763 | case ISD::ADD: | |||
| 24764 | case ISD::SUB: | |||
| 24765 | case ISD::MUL: | |||
| 24766 | case ISD::SHL: | |||
| 24767 | if (Op.getNode()->getFlags().hasNoSignedWrap()) | |||
| 24768 | break; | |||
| 24769 | [[fallthrough]]; | |||
| 24770 | default: | |||
| 24771 | NeedOF = true; | |||
| 24772 | break; | |||
| 24773 | } | |||
| 24774 | break; | |||
| 24775 | } | |||
| 24776 | } | |||
| 24777 | // See if we can use the EFLAGS value from the operand instead of | |||
| 24778 | // doing a separate TEST. TEST always sets OF and CF to 0, so unless | |||
| 24779 | // we prove that the arithmetic won't overflow, we can't use OF or CF. | |||
| 24780 | if (Op.getResNo() != 0 || NeedOF || NeedCF) { | |||
| 24781 | // Emit a CMP with 0, which is the TEST pattern. | |||
| 24782 | return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, | |||
| 24783 | DAG.getConstant(0, dl, Op.getValueType())); | |||
| 24784 | } | |||
| 24785 | unsigned Opcode = 0; | |||
| 24786 | unsigned NumOperands = 0; | |||
| 24787 | ||||
| 24788 | SDValue ArithOp = Op; | |||
| 24789 | ||||
| 24790 | // NOTICE: In the code below we use ArithOp to hold the arithmetic operation | |||
| 24791 | // which may be the result of a CAST. We use the variable 'Op', which is the | |||
| 24792 | // non-casted variable when we check for possible users. | |||
| 24793 | switch (ArithOp.getOpcode()) { | |||
| 24794 | case ISD::AND: | |||
| 24795 | // If the primary 'and' result isn't used, don't bother using X86ISD::AND, | |||
| 24796 | // because a TEST instruction will be better. | |||
| 24797 | if (!hasNonFlagsUse(Op)) | |||
| 24798 | break; | |||
| 24799 | ||||
| 24800 | [[fallthrough]]; | |||
| 24801 | case ISD::ADD: | |||
| 24802 | case ISD::SUB: | |||
| 24803 | case ISD::OR: | |||
| 24804 | case ISD::XOR: | |||
| 24805 | if (!isProfitableToUseFlagOp(Op)) | |||
| 24806 | break; | |||
| 24807 | ||||
| 24808 | // Otherwise use a regular EFLAGS-setting instruction. | |||
| 24809 | switch (ArithOp.getOpcode()) { | |||
| 24810 | default: llvm_unreachable("unexpected operator!"); | |||
| 24811 | case ISD::ADD: Opcode = X86ISD::ADD; break; | |||
| 24812 | case ISD::SUB: Opcode = X86ISD::SUB; break; | |||
| 24813 | case ISD::XOR: Opcode = X86ISD::XOR; break; | |||
| 24814 | case ISD::AND: Opcode = X86ISD::AND; break; | |||
| 24815 | case ISD::OR: Opcode = X86ISD::OR; break; | |||
| 24816 | } | |||
| 24817 | ||||
| 24818 | NumOperands = 2; | |||
| 24819 | break; | |||
| 24820 | case X86ISD::ADD: | |||
| 24821 | case X86ISD::SUB: | |||
| 24822 | case X86ISD::OR: | |||
| 24823 | case X86ISD::XOR: | |||
| 24824 | case X86ISD::AND: | |||
| 24825 | return SDValue(Op.getNode(), 1); | |||
| 24826 | case ISD::SSUBO: | |||
| 24827 | case ISD::USUBO: { | |||
| 24828 | // SSUBO/USUBO will become an X86ISD::SUB and we can use its Z flag. | |||
| 24829 | SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); | |||
| 24830 | return DAG.getNode(X86ISD::SUB, dl, VTs, Op->getOperand(0), | |||
| 24831 | Op->getOperand(1)).getValue(1); | |||
| 24832 | } | |||
| 24833 | default: | |||
| 24834 | break; | |||
| 24835 | } | |||
| 24836 | ||||
| 24837 | if (Opcode == 0) { | |||
| 24838 | // Emit a CMP with 0, which is the TEST pattern. | |||
| 24839 | return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, | |||
| 24840 | DAG.getConstant(0, dl, Op.getValueType())); | |||
| 24841 | } | |||
| 24842 | SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); | |||
| 24843 | SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands); | |||
| 24844 | ||||
| 24845 | SDValue New = DAG.getNode(Opcode, dl, VTs, Ops); | |||
| 24846 | DAG.ReplaceAllUsesOfValueWith(SDValue(Op.getNode(), 0), New); | |||
| 24847 | return SDValue(New.getNode(), 1); | |||
| 24848 | } | |||
| 24849 | ||||
| 24850 | /// Emit nodes that will be selected as "cmp Op0,Op1", or something | |||
| 24851 | /// equivalent. | |||
| 24852 | static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, | |||
| 24853 | const SDLoc &dl, SelectionDAG &DAG, | |||
| 24854 | const X86Subtarget &Subtarget) { | |||
| 24855 | if (isNullConstant(Op1)) | |||
| 24856 | return EmitTest(Op0, X86CC, dl, DAG, Subtarget); | |||
| 24857 | ||||
| 24858 | EVT CmpVT = Op0.getValueType(); | |||
| 24859 | ||||
| 24860 | assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 || | |||
| 24861 | CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!"); | |||
| 24862 | ||||
| 24863 | // Only promote the compare up to i32 if it is a 16-bit operation | |||
| 24864 | // with an immediate; 16-bit immediates are to be avoided. | |||
| 24865 | if (CmpVT == MVT::i16 && !Subtarget.isAtom() && | |||
| 24866 | !DAG.getMachineFunction().getFunction().hasMinSize()) { | |||
| 24867 | ConstantSDNode *COp0 = dyn_cast<ConstantSDNode>(Op0); | |||
| 24868 | ConstantSDNode *COp1 = dyn_cast<ConstantSDNode>(Op1); | |||
| 24869 | // Don't do this if the immediate can fit in 8-bits. | |||
| 24870 | if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) || | |||
| 24871 | (COp1 && !COp1->getAPIntValue().isSignedIntN(8))) { | |||
| 24872 | unsigned ExtendOp = | |||
| 24873 | isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; | |||
| 24874 | if (X86CC == X86::COND_E || X86CC == X86::COND_NE) { | |||
| 24875 | // For equality comparisons try to use SIGN_EXTEND if the input was | |||
| 24876 | // truncate from something with enough sign bits. | |||
| 24877 | if (Op0.getOpcode() == ISD::TRUNCATE) { | |||
| 24878 | if (DAG.ComputeMaxSignificantBits(Op0.getOperand(0)) <= 16) | |||
| 24879 | ExtendOp = ISD::SIGN_EXTEND; | |||
| 24880 | } else if (Op1.getOpcode() == ISD::TRUNCATE) { | |||
| 24881 | if (DAG.ComputeMaxSignificantBits(Op1.getOperand(0)) <= 16) | |||
| 24882 | ExtendOp = ISD::SIGN_EXTEND; | |||
| 24883 | } | |||
| 24884 | } | |||
| 24885 | ||||
| 24886 | CmpVT = MVT::i32; | |||
| 24887 | Op0 = DAG.getNode(ExtendOp, dl, CmpVT, Op0); | |||
| 24888 | Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1); | |||
| 24889 | } | |||
| 24890 | } | |||
| 24891 | ||||
| 24892 | // Try to shrink i64 compares if the input has enough zero bits. | |||
| 24893 | // FIXME: Do this for non-constant compares for constant on LHS? | |||
| 24894 | if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) && | |||
| 24895 | Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub. | |||
| 24896 | cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 && | |||
| 24897 | DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) { | |||
| 24898 | CmpVT = MVT::i32; | |||
| 24899 | Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0); | |||
| 24900 | Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1); | |||
| 24901 | } | |||
| 24902 | ||||
| 24903 | // 0-x == y --> x+y == 0 | |||
| 24904 | // 0-x != y --> x+y != 0 | |||
| 24905 | if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) && | |||
| 24906 | Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { | |||
| 24907 | SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); | |||
| 24908 | SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1); | |||
| 24909 | return Add.getValue(1); | |||
| 24910 | } | |||
| 24911 | ||||
| 24912 | // x == 0-y --> x+y == 0 | |||
| 24913 | // x != 0-y --> x+y != 0 | |||
| 24914 | if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) && | |||
| 24915 | Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) { | |||
| 24916 | SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); | |||
| 24917 | SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1)); | |||
| 24918 | return Add.getValue(1); | |||
| 24919 | } | |||
| 24920 | ||||
| 24921 | // Use SUB instead of CMP to enable CSE between SUB and CMP. | |||
| 24922 | SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32); | |||
| 24923 | SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); | |||
| 24924 | return Sub.getValue(1); | |||
| 24925 | } | |||
| 24926 | ||||
| 24927 | /// Check if replacement of SQRT with RSQRT should be disabled. | |||
| 24928 | bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const { | |||
| 24929 | EVT VT = Op.getValueType(); | |||
| 24930 | ||||
| 24931 | // We don't need to replace SQRT with RSQRT for half type. | |||
| 24932 | if (VT.getScalarType() == MVT::f16) | |||
| 24933 | return true; | |||
| 24934 | ||||
| 24935 | // We never want to use both SQRT and RSQRT instructions for the same input. | |||
| 24936 | if (DAG.doesNodeExist(X86ISD::FRSQRT, DAG.getVTList(VT), Op)) | |||
| 24937 | return false; | |||
| 24938 | ||||
| 24939 | if (VT.isVector()) | |||
| 24940 | return Subtarget.hasFastVectorFSQRT(); | |||
| 24941 | return Subtarget.hasFastScalarFSQRT(); | |||
| 24942 | } | |||
| 24943 | ||||
| 24944 | /// The minimum architected relative accuracy is 2^-12. We need one | |||
| 24945 | /// Newton-Raphson step to have a good float result (24 bits of precision). | |||
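| | /// One Newton-Raphson step for rsqrt refines an estimate E of 1/sqrt(X) as | |||
| | ///   E' = E * (1.5 - 0.5 * X * E * E) | |||
| | /// roughly doubling the number of accurate bits, so 2^-12 -> ~24 bits. | |||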
| 24946 | SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, | |||
| 24947 | SelectionDAG &DAG, int Enabled, | |||
| 24948 | int &RefinementSteps, | |||
| 24949 | bool &UseOneConstNR, | |||
| 24950 | bool Reciprocal) const { | |||
| 24951 | SDLoc DL(Op); | |||
| 24952 | EVT VT = Op.getValueType(); | |||
| 24953 | ||||
| 24954 | // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps. | |||
| 24955 | // It is likely not profitable to do this for f64 because a double-precision | |||
| 24956 | // rsqrt estimate with refinement on x86 prior to FMA requires at least 16 | |||
| 24957 | // instructions: convert to single, rsqrtss, convert back to double, refine | |||
| 24958 | // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA | |||
| 24959 | // along with FMA, this could be a throughput win. | |||
| 24960 | // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32 | |||
| 24961 | // after legalize types. | |||
| 24962 | if ((VT == MVT::f32 && Subtarget.hasSSE1()) || | |||
| 24963 | (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) || | |||
| 24964 | (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) || | |||
| 24965 | (VT == MVT::v8f32 && Subtarget.hasAVX()) || | |||
| 24966 | (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) { | |||
| 24967 | if (RefinementSteps == ReciprocalEstimate::Unspecified) | |||
| 24968 | RefinementSteps = 1; | |||
| 24969 | ||||
| 24970 | UseOneConstNR = false; | |||
| 24971 | // There is no FRSQRT for 512-bits, but there is RSQRT14. | |||
| 24972 | unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RSQRT14 : X86ISD::FRSQRT; | |||
| 24973 | SDValue Estimate = DAG.getNode(Opcode, DL, VT, Op); | |||
| 24974 | if (RefinementSteps == 0 && !Reciprocal) | |||
| 24975 | Estimate = DAG.getNode(ISD::FMUL, DL, VT, Op, Estimate); | |||
| 24976 | return Estimate; | |||
| 24977 | } | |||
| 24978 | ||||
| 24979 | if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) && | |||
| 24980 | Subtarget.hasFP16()) { | |||
| 24981 | assert(Reciprocal && "Don't replace SQRT with RSQRT for half type"); | |||
| 24982 | if (RefinementSteps == ReciprocalEstimate::Unspecified) | |||
| 24983 | RefinementSteps = 0; | |||
| 24984 | ||||
| 24985 | if (VT == MVT::f16) { | |||
| 24986 | SDValue Zero = DAG.getIntPtrConstant(0, DL); | |||
| 24987 | SDValue Undef = DAG.getUNDEF(MVT::v8f16); | |||
| 24988 | Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op); | |||
| 24989 | Op = DAG.getNode(X86ISD::RSQRT14S, DL, MVT::v8f16, Undef, Op); | |||
| 24990 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero); | |||
| 24991 | } | |||
| 24992 | ||||
| 24993 | return DAG.getNode(X86ISD::RSQRT14, DL, VT, Op); | |||
| 24994 | } | |||
| 24995 | return SDValue(); | |||
| 24996 | } | |||
| 24997 | ||||
| 24998 | /// The minimum architected relative accuracy is 2^-12. We need one | |||
| 24999 | /// Newton-Raphson step to have a good float result (24 bits of precision). | |||
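| | /// One Newton-Raphson step for rcp refines an estimate E of 1/X as | |||
| | ///   E' = E * (2.0 - X * E) | |||
| | /// again roughly doubling the accurate bits per step. | |||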
| 25000 | SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG, | |||
| 25001 | int Enabled, | |||
| 25002 | int &RefinementSteps) const { | |||
| 25003 | SDLoc DL(Op); | |||
| 25004 | EVT VT = Op.getValueType(); | |||
| 25005 | ||||
| 25006 | // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps. | |||
| 25007 | // It is likely not profitable to do this for f64 because a double-precision | |||
| 25008 | // reciprocal estimate with refinement on x86 prior to FMA requires | |||
| 25009 | // 15 instructions: convert to single, rcpss, convert back to double, refine | |||
| 25010 | // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA | |||
| 25011 | // along with FMA, this could be a throughput win. | |||
| 25012 | ||||
| 25013 | if ((VT == MVT::f32 && Subtarget.hasSSE1()) || | |||
| 25014 | (VT == MVT::v4f32 && Subtarget.hasSSE1()) || | |||
| 25015 | (VT == MVT::v8f32 && Subtarget.hasAVX()) || | |||
| 25016 | (VT == MVT::v16f32 && Subtarget.useAVX512Regs())) { | |||
| 25017 | // Enable estimate codegen with 1 refinement step for vector division. | |||
| 25018 | // Scalar division estimates are disabled because they break too much | |||
| 25019 | // real-world code. These defaults are intended to match GCC behavior. | |||
| 25020 | if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified) | |||
| 25021 | return SDValue(); | |||
| 25022 | ||||
| 25023 | if (RefinementSteps == ReciprocalEstimate::Unspecified) | |||
| 25024 | RefinementSteps = 1; | |||
| 25025 | ||||
| 25026 | // There is no FRCP for 512-bits, but there is RCP14. | |||
| 25027 | unsigned Opcode = VT == MVT::v16f32 ? X86ISD::RCP14 : X86ISD::FRCP; | |||
| 25028 | return DAG.getNode(Opcode, DL, VT, Op); | |||
| 25029 | } | |||
| 25030 | ||||
| 25031 | if (VT.getScalarType() == MVT::f16 && isTypeLegal(VT) && | |||
| 25032 | Subtarget.hasFP16()) { | |||
| 25033 | if (RefinementSteps == ReciprocalEstimate::Unspecified) | |||
| 25034 | RefinementSteps = 0; | |||
| 25035 | ||||
| 25036 | if (VT == MVT::f16) { | |||
| 25037 | SDValue Zero = DAG.getIntPtrConstant(0, DL); | |||
| 25038 | SDValue Undef = DAG.getUNDEF(MVT::v8f16); | |||
| 25039 | Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v8f16, Op); | |||
| 25040 | Op = DAG.getNode(X86ISD::RCP14S, DL, MVT::v8f16, Undef, Op); | |||
| 25041 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Op, Zero); | |||
| 25042 | } | |||
| 25043 | ||||
| 25044 | return DAG.getNode(X86ISD::RCP14, DL, VT, Op); | |||
| 25045 | } | |||
| 25046 | return SDValue(); | |||
| 25047 | } | |||
| 25048 | ||||
| 25049 | /// If we have at least two divisions that use the same divisor, convert to | |||
| 25050 | /// multiplication by a reciprocal. This may need to be adjusted for a given | |||
| 25051 | /// CPU if a division's cost is not at least twice the cost of a multiplication. | |||
| 25052 | /// This is because we still need one division to calculate the reciprocal and | |||
| 25053 | /// then we need two multiplies by that reciprocal as replacements for the | |||
| 25054 | /// original divisions. | |||
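| | /// Example: for x/d and y/d, emitting r = 1.0/d; x*r; y*r trades two | |||
| | /// divides for one divide plus two multiplies, which only pays off when a | |||
| | /// divide costs at least twice a multiply. | |||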
| 25055 | unsigned X86TargetLowering::combineRepeatedFPDivisors() const { | |||
| 25056 | return 2; | |||
| 25057 | } | |||
| 25058 | ||||
| 25059 | SDValue | |||
| 25060 | X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, | |||
| 25061 | SelectionDAG &DAG, | |||
| 25062 | SmallVectorImpl<SDNode *> &Created) const { | |||
| 25063 | AttributeList Attr = DAG.getMachineFunction().getFunction().getAttributes(); | |||
| 25064 | if (isIntDivCheap(N->getValueType(0), Attr)) | |||
| 25065 | return SDValue(N,0); // Lower SDIV as SDIV | |||
| 25066 | ||||
| 25067 | assert((Divisor.isPowerOf2() || Divisor.isNegatedPowerOf2()) && | |||
| 25068 | "Unexpected divisor!"); | |||
| 25069 | ||||
| 25070 | // Only perform this transform if CMOV is supported; otherwise the select | |||
| 25071 | // below will become a branch. | |||
| 25072 | if (!Subtarget.canUseCMOV()) | |||
| 25073 | return SDValue(); | |||
| 25074 | ||||
| 25075 | // fold (sdiv X, pow2) | |||
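| | // e.g. (sdiv X, 8) -> (X + ((X s< 0) ? 7 : 0)) >> 3, with the conditional | |||
| | // add done via CMOV; for a negative divisor the shifted result is negated. | |||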
| 25076 | EVT VT = N->getValueType(0); | |||
| 25077 | // FIXME: Support i8. | |||
| 25078 | if (VT != MVT::i16 && VT != MVT::i32 && | |||
| 25079 | !(Subtarget.is64Bit() && VT == MVT::i64)) | |||
| 25080 | return SDValue(); | |||
| 25081 | ||||
| 25082 | unsigned Lg2 = Divisor.countr_zero(); | |||
| 25083 | ||||
| 25084 | // If the divisor is 2 or -2, the default expansion is better. | |||
| 25085 | if (Lg2 == 1) | |||
| 25086 | return SDValue(); | |||
| 25087 | ||||
| 25088 | SDLoc DL(N); | |||
| 25089 | SDValue N0 = N->getOperand(0); | |||
| 25090 | SDValue Zero = DAG.getConstant(0, DL, VT); | |||
| 25091 | APInt Lg2Mask = APInt::getLowBitsSet(VT.getSizeInBits(), Lg2); | |||
| 25092 | SDValue Pow2MinusOne = DAG.getConstant(Lg2Mask, DL, VT); | |||
| 25093 | ||||
| 25094 | // If N0 is negative, we need to add (Pow2 - 1) to it before shifting right. | |||
| 25095 | SDValue Cmp = DAG.getSetCC(DL, MVT::i8, N0, Zero, ISD::SETLT); | |||
| 25096 | SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N0, Pow2MinusOne); | |||
| 25097 | SDValue CMov = DAG.getNode(ISD::SELECT, DL, VT, Cmp, Add, N0); | |||
| 25098 | ||||
| 25099 | Created.push_back(Cmp.getNode()); | |||
| 25100 | Created.push_back(Add.getNode()); | |||
| 25101 | Created.push_back(CMov.getNode()); | |||
| 25102 | ||||
| 25103 | // Divide by pow2. | |||
| 25104 | SDValue SRA = | |||
| 25105 | DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8)); | |||
| 25106 | ||||
| 25107 | // If we're dividing by a positive value, we're done. Otherwise, we must | |||
| 25108 | // negate the result. | |||
| 25109 | if (Divisor.isNonNegative()) | |||
| 25110 | return SRA; | |||
| 25111 | ||||
| 25112 | Created.push_back(SRA.getNode()); | |||
| 25113 | return DAG.getNode(ISD::SUB, DL, VT, Zero, SRA); | |||
| 25114 | } | |||
| 25115 | ||||
| 25116 | /// Result of 'and' is compared against zero. Change to a BT node if possible. | |||
| 25117 | /// Returns the BT node and the condition code needed to use it. | |||
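| | /// e.g. (and (srl X, C), 1) != 0 and (and X, (shl 1, C)) != 0 both map to | |||
| | /// (X86ISD::BT X, C), using COND_B for SETNE and COND_AE for SETEQ. | |||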
| 25118 | static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, | |||
| 25119 | SelectionDAG &DAG, X86::CondCode &X86CC) { | |||
| 25120 | assert(And.getOpcode() == ISD::AND && "Expected AND node!"); | |||
| 25121 | SDValue Op0 = And.getOperand(0); | |||
| 25122 | SDValue Op1 = And.getOperand(1); | |||
| 25123 | if (Op0.getOpcode() == ISD::TRUNCATE) | |||
| 25124 | Op0 = Op0.getOperand(0); | |||
| 25125 | if (Op1.getOpcode() == ISD::TRUNCATE) | |||
| 25126 | Op1 = Op1.getOperand(0); | |||
| 25127 | ||||
| 25128 | SDValue Src, BitNo; | |||
| 25129 | if (Op1.getOpcode() == ISD::SHL) | |||
| 25130 | std::swap(Op0, Op1); | |||
| 25131 | if (Op0.getOpcode() == ISD::SHL) { | |||
| 25132 | if (isOneConstant(Op0.getOperand(0))) { | |||
| 25133 | // If we looked past a truncate, check that it's only truncating away | |||
| 25134 | // known zeros. | |||
| 25135 | unsigned BitWidth = Op0.getValueSizeInBits(); | |||
| 25136 | unsigned AndBitWidth = And.getValueSizeInBits(); | |||
| 25137 | if (BitWidth > AndBitWidth) { | |||
| 25138 | KnownBits Known = DAG.computeKnownBits(Op0); | |||
| 25139 | if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth) | |||
| 25140 | return SDValue(); | |||
| 25141 | } | |||
| 25142 | Src = Op1; | |||
| 25143 | BitNo = Op0.getOperand(1); | |||
| 25144 | } | |||
| 25145 | } else if (Op1.getOpcode() == ISD::Constant) { | |||
| 25146 | ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); | |||
| 25147 | uint64_t AndRHSVal = AndRHS->getZExtValue(); | |||
| 25148 | SDValue AndLHS = Op0; | |||
| 25149 | ||||
| 25150 | if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { | |||
| 25151 | Src = AndLHS.getOperand(0); | |||
| 25152 | BitNo = AndLHS.getOperand(1); | |||
| 25153 | } else { | |||
| 25154 | // Use BT if the immediate can't be encoded in a TEST instruction or we | |||
| 25155 | // are optimizing for size and the immediate won't fit in a byte. | |||
| 25156 | bool OptForSize = DAG.shouldOptForSize(); | |||
| 25157 | if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) && | |||
| 25158 | isPowerOf2_64(AndRHSVal)) { | |||
| 25159 | Src = AndLHS; | |||
| 25160 | BitNo = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, | |||
| 25161 | Src.getValueType()); | |||
| 25162 | } | |||
| 25163 | } | |||
| 25164 | } | |||
| 25165 | ||||
| 25166 | // No patterns found, give up. | |||
| 25167 | if (!Src.getNode()) | |||
| 25168 | return SDValue(); | |||
| 25169 | ||||
| 25170 | // Remove any bit flip. | |||
| 25171 | if (isBitwiseNot(Src)) { | |||
| 25172 | Src = Src.getOperand(0); | |||
| 25173 | CC = CC == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ; | |||
| 25174 | } | |||
| 25175 | ||||
| 25176 | // Attempt to create the X86ISD::BT node. | |||
| 25177 | if (SDValue BT = getBT(Src, BitNo, dl, DAG)) { | |||
| 25178 | X86CC = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; | |||
| 25179 | return BT; | |||
| 25180 | } | |||
| 25181 | ||||
| 25182 | return SDValue(); | |||
| 25183 | } | |||
| 25184 | ||||
| 25185 | // Check if pre-AVX condcode can be performed by a single FCMP op. | |||
| 25186 | static bool cheapX86FSETCC_SSE(ISD::CondCode SetCCOpcode) { | |||
| 25187 | return (SetCCOpcode != ISD::SETONE) && (SetCCOpcode != ISD::SETUEQ); | |||
| 25188 | } | |||
| 25189 | ||||
| 25190 | /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask | |||
| 25191 | /// CMPs. | |||
| 25192 | static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, | |||
| 25193 | SDValue &Op1, bool &IsAlwaysSignaling) { | |||
| 25194 | unsigned SSECC; | |||
| 25195 | bool Swap = false; | |||
| 25196 | ||||
| 25197 | // SSE Condition code mapping: | |||
| 25198 | // 0 - EQ | |||
| 25199 | // 1 - LT | |||
| 25200 | // 2 - LE | |||
| 25201 | // 3 - UNORD | |||
| 25202 | // 4 - NEQ | |||
| 25203 | // 5 - NLT | |||
| 25204 | // 6 - NLE | |||
| 25205 | // 7 - ORD | |||
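| | // Encodings 8+ (e.g. 8 - EQ_UQ for SETUEQ, 12 - NEQ_OQ for SETONE) only | |||
| | // exist in the extended AVX predicate space; pre-AVX these cases are | |||
| | // split into two compares (see cheapX86FSETCC_SSE). | |||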
| 25206 | switch (SetCCOpcode) { | |||
| 25207 | default: llvm_unreachable("Unexpected SETCC condition"); | |||
| 25208 | case ISD::SETOEQ: | |||
| 25209 | case ISD::SETEQ: SSECC = 0; break; | |||
| 25210 | case ISD::SETOGT: | |||
| 25211 | case ISD::SETGT: Swap = true; [[fallthrough]]; | |||
| 25212 | case ISD::SETLT: | |||
| 25213 | case ISD::SETOLT: SSECC = 1; break; | |||
| 25214 | case ISD::SETOGE: | |||
| 25215 | case ISD::SETGE: Swap = true; [[fallthrough]]; | |||
| 25216 | case ISD::SETLE: | |||
| 25217 | case ISD::SETOLE: SSECC = 2; break; | |||
| 25218 | case ISD::SETUO: SSECC = 3; break; | |||
| 25219 | case ISD::SETUNE: | |||
| 25220 | case ISD::SETNE: SSECC = 4; break; | |||
| 25221 | case ISD::SETULE: Swap = true; [[fallthrough]]; | |||
| 25222 | case ISD::SETUGE: SSECC = 5; break; | |||
| 25223 | case ISD::SETULT: Swap = true; [[fallthrough]]; | |||
| 25224 | case ISD::SETUGT: SSECC = 6; break; | |||
| 25225 | case ISD::SETO: SSECC = 7; break; | |||
| 25226 | case ISD::SETUEQ: SSECC = 8; break; | |||
| 25227 | case ISD::SETONE: SSECC = 12; break; | |||
| 25228 | } | |||
| 25229 | if (Swap) | |||
| 25230 | std::swap(Op0, Op1); | |||
| 25231 | ||||
| 25232 | switch (SetCCOpcode) { | |||
| 25233 | default: | |||
| 25234 | IsAlwaysSignaling = true; | |||
| 25235 | break; | |||
| 25236 | case ISD::SETEQ: | |||
| 25237 | case ISD::SETOEQ: | |||
| 25238 | case ISD::SETUEQ: | |||
| 25239 | case ISD::SETNE: | |||
| 25240 | case ISD::SETONE: | |||
| 25241 | case ISD::SETUNE: | |||
| 25242 | case ISD::SETO: | |||
| 25243 | case ISD::SETUO: | |||
| 25244 | IsAlwaysSignaling = false; | |||
| 25245 | break; | |||
| 25246 | } | |||
| 25247 | ||||
| 25248 | return SSECC; | |||
| 25249 | } | |||
| 25250 | ||||
| 25251 | /// Break a 256-bit integer VSETCC into two new 128-bit ones and then | |||
| 25252 | /// concatenate the result back. | |||
| 25253 | static SDValue splitIntVSETCC(EVT VT, SDValue LHS, SDValue RHS, | |||
| 25254 | ISD::CondCode Cond, SelectionDAG &DAG, | |||
| 25255 | const SDLoc &dl) { | |||
| 25256 | assert(VT.isInteger() && VT == LHS.getValueType() && | |||
| 25257 | VT == RHS.getValueType() && "Unsupported VTs!"); | |||
| 25258 | ||||
| 25259 | SDValue CC = DAG.getCondCode(Cond); | |||
| 25260 | ||||
| 25261 | // Extract the LHS Lo/Hi vectors | |||
| 25262 | SDValue LHS1, LHS2; | |||
| 25263 | std::tie(LHS1, LHS2) = splitVector(LHS, DAG, dl); | |||
| 25264 | ||||
| 25265 | // Extract the RHS Lo/Hi vectors | |||
| 25266 | SDValue RHS1, RHS2; | |||
| 25267 | std::tie(RHS1, RHS2) = splitVector(RHS, DAG, dl); | |||
| 25268 | ||||
| 25269 | // Issue the operation on the smaller types and concatenate the result back | |||
| 25270 | EVT LoVT, HiVT; | |||
| 25271 | std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); | |||
| 25272 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, | |||
| 25273 | DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC), | |||
| 25274 | DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC)); | |||
| 25275 | } | |||
| 25276 | ||||
| 25277 | static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { | |||
| 25278 | ||||
| 25279 | SDValue Op0 = Op.getOperand(0); | |||
| 25280 | SDValue Op1 = Op.getOperand(1); | |||
| 25281 | SDValue CC = Op.getOperand(2); | |||
| 25282 | MVT VT = Op.getSimpleValueType(); | |||
| 25283 | SDLoc dl(Op); | |||
| 25284 | ||||
| 25285 | assert(VT.getVectorElementType() == MVT::i1 && | |||
| 25286 | "Cannot set masked compare for this operation"); | |||
| 25287 | ||||
| 25288 | ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get(); | |||
| 25289 | ||||
| 25290 | // Prefer SETGT over SETLT. | |||
| 25291 | if (SetCCOpcode == ISD::SETLT) { | |||
| 25292 | SetCCOpcode = ISD::getSetCCSwappedOperands(SetCCOpcode); | |||
| 25293 | std::swap(Op0, Op1); | |||
| 25294 | } | |||
| 25295 | ||||
| 25296 | return DAG.getSetCC(dl, VT, Op0, Op1, SetCCOpcode); | |||
| 25297 | } | |||
| 25298 | ||||
| 25299 | /// Given a buildvector constant, return a new vector constant with each element | |||
| 25300 | /// incremented or decremented. If incrementing or decrementing would result in | |||
| 25301 | /// unsigned overflow or underflow or this is not a simple vector constant, | |||
| 25302 | /// return an empty value. | |||
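| | /// e.g. incrementing <u8 3, u8 7> yields <u8 4, u8 8>, but <u8 255, ...> | |||
| | /// returns an empty SDValue since the increment would wrap. | |||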
| 25303 | static SDValue incDecVectorConstant(SDValue V, SelectionDAG &DAG, bool IsInc, | |||
| 25304 | bool NSW) { | |||
| 25305 | auto *BV = dyn_cast<BuildVectorSDNode>(V.getNode()); | |||
| 25306 | if (!BV || !V.getValueType().isSimple()) | |||
| 25307 | return SDValue(); | |||
| 25308 | ||||
| 25309 | MVT VT = V.getSimpleValueType(); | |||
| 25310 | MVT EltVT = VT.getVectorElementType(); | |||
| 25311 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 25312 | SmallVector<SDValue, 8> NewVecC; | |||
| 25313 | SDLoc DL(V); | |||
| 25314 | for (unsigned i = 0; i < NumElts; ++i) { | |||
| 25315 | auto *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i)); | |||
| 25316 | if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EltVT) | |||
| 25317 | return SDValue(); | |||
| 25318 | ||||
| 25319 | // Avoid overflow/underflow. | |||
| 25320 | const APInt &EltC = Elt->getAPIntValue(); | |||
| 25321 | if ((IsInc && EltC.isMaxValue()) || (!IsInc && EltC.isZero())) | |||
| 25322 | return SDValue(); | |||
| 25323 | if (NSW && ((IsInc && EltC.isMaxSignedValue()) || | |||
| 25324 | (!IsInc && EltC.isMinSignedValue()))) | |||
| 25325 | return SDValue(); | |||
| 25326 | ||||
| 25327 | NewVecC.push_back(DAG.getConstant(EltC + (IsInc ? 1 : -1), DL, EltVT)); | |||
| 25328 | } | |||
| 25329 | ||||
| 25330 | return DAG.getBuildVector(VT, DL, NewVecC); | |||
| 25331 | } | |||
| 25332 | ||||
| 25333 | /// As another special case, use PSUBUS[BW] when it's profitable. E.g. for | |||
| 25334 | /// Op0 u<= Op1: | |||
| 25335 | /// t = psubus Op0, Op1 | |||
| 25336 | /// pcmpeq t, <0..0> | |||
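| | /// This works because psubus computes max(Op0 - Op1, 0), which is zero | |||
| | /// exactly when Op0 u<= Op1. | |||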
| 25337 | static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT, | |||
| 25338 | ISD::CondCode Cond, const SDLoc &dl, | |||
| 25339 | const X86Subtarget &Subtarget, | |||
| 25340 | SelectionDAG &DAG) { | |||
| 25341 | if (!Subtarget.hasSSE2()) | |||
| 25342 | return SDValue(); | |||
| 25343 | ||||
| 25344 | MVT VET = VT.getVectorElementType(); | |||
| 25345 | if (VET != MVT::i8 && VET != MVT::i16) | |||
| 25346 | return SDValue(); | |||
| 25347 | ||||
| 25348 | switch (Cond) { | |||
| 25349 | default: | |||
| 25350 | return SDValue(); | |||
| 25351 | case ISD::SETULT: { | |||
| 25352 | // If the comparison is against a constant we can turn this into a | |||
| 25353 | // setule. With psubus, setule does not require a swap. This is | |||
| 25354 | // beneficial because the constant in the register is no longer | |||
| 25355 | // clobbered as the destination, so it can be hoisted out of a loop. | |||
| 25356 | // Only do this pre-AVX since vpcmp* is no longer destructive. | |||
| 25357 | if (Subtarget.hasAVX()) | |||
| 25358 | return SDValue(); | |||
| 25359 | SDValue ULEOp1 = | |||
| 25360 | incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false); | |||
| 25361 | if (!ULEOp1) | |||
| 25362 | return SDValue(); | |||
| 25363 | Op1 = ULEOp1; | |||
| 25364 | break; | |||
| 25365 | } | |||
| 25366 | case ISD::SETUGT: { | |||
| 25367 | // If the comparison is against a constant, we can turn this into a setuge. | |||
| 25368 | // This is beneficial because materializing a constant 0 for the PCMPEQ is | |||
| 25369 | // probably cheaper than XOR+PCMPGT using 2 different vector constants: | |||
| 25370 | // cmpgt (xor X, SignMaskC) CmpC --> cmpeq (usubsat (CmpC+1), X), 0 | |||
| 25371 | SDValue UGEOp1 = | |||
| 25372 | incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false); | |||
| 25373 | if (!UGEOp1) | |||
| 25374 | return SDValue(); | |||
| 25375 | Op1 = Op0; | |||
| 25376 | Op0 = UGEOp1; | |||
| 25377 | break; | |||
| 25378 | } | |||
| 25379 | // Psubus is better than flip-sign because it requires no inversion. | |||
| 25380 | case ISD::SETUGE: | |||
| 25381 | std::swap(Op0, Op1); | |||
| 25382 | break; | |||
| 25383 | case ISD::SETULE: | |||
| 25384 | break; | |||
| 25385 | } | |||
| 25386 | ||||
| 25387 | SDValue Result = DAG.getNode(ISD::USUBSAT, dl, VT, Op0, Op1); | |||
| 25388 | return DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result, | |||
| 25389 | DAG.getConstant(0, dl, VT)); | |||
| 25390 | } | |||
| 25391 | ||||
| 25392 | static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, | |||
| 25393 | SelectionDAG &DAG) { | |||
| 25394 | bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC || | |||
| 25395 | Op.getOpcode() == ISD::STRICT_FSETCCS; | |||
| 25396 | SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0); | |||
| 25397 | SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1); | |||
| 25398 | SDValue CC = Op.getOperand(IsStrict ? 3 : 2); | |||
| 25399 | MVT VT = Op->getSimpleValueType(0); | |||
| 25400 | ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get(); | |||
| 25401 | bool isFP = Op1.getSimpleValueType().isFloatingPoint(); | |||
| 25402 | SDLoc dl(Op); | |||
| 25403 | ||||
| 25404 | if (isFP) { | |||
| 25405 | MVT EltVT = Op0.getSimpleValueType().getVectorElementType(); | |||
| 25406 | assert(EltVT == MVT::f16 || EltVT == MVT::f32 || EltVT == MVT::f64); | |||
| 25407 | if (isSoftFP16(EltVT, Subtarget)) | |||
| 25408 | return SDValue(); | |||
| 25409 | ||||
| 25410 | bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; | |||
| 25411 | SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); | |||
| 25412 | ||||
| 25413 | // If we have a strict compare with a vXi1 result and the input is 128/256 | |||
| 25414 | // bits we can't use a masked compare unless we have VLX. If we use a wider | |||
| 25415 | // compare like we do for non-strict, we might trigger spurious exceptions | |||
| 25416 | // from the upper elements. Instead emit an AVX compare and convert to a mask. | |||
| 25417 | unsigned Opc; | |||
| 25418 | if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 && | |||
| 25419 | (!IsStrict || Subtarget.hasVLX() || | |||
| 25420 | Op0.getSimpleValueType().is512BitVector())) { | |||
| 25421 | #ifndef NDEBUG | |||
| 25422 | unsigned Num = VT.getVectorNumElements(); | |||
| 25423 | assert(Num <= 16 || (Num == 32 && EltVT == MVT::f16)); | |||
| 25424 | #endif | |||
| 25425 | Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM; | |||
| 25426 | } else { | |||
| 25427 | Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP; | |||
| 25428 | // The SSE/AVX packed FP comparison nodes are defined with a | |||
| 25429 | // floating-point vector result that matches the operand type. This allows | |||
| 25430 | // them to work with an SSE1 target (integer vector types are not legal). | |||
| 25431 | VT = Op0.getSimpleValueType(); | |||
| 25432 | } | |||
| 25433 | ||||
| 25434 | SDValue Cmp; | |||
| 25435 | bool IsAlwaysSignaling; | |||
| 25436 | unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling); | |||
| 25437 | if (!Subtarget.hasAVX()) { | |||
| 25438 | // TODO: We could use the following steps to handle a quiet compare with | |||
| 25439 | // signaling encodings. | |||
| 25440 | // 1. Get ordered masks from a quiet ISD::SETO | |||
| 25441 | // 2. Use the masks to mask potential unordered elements in operands A, B | |||
| 25442 | // 3. Get the compare results of the masked A, B | |||
| 25443 | // 4. Calculate the final result using the mask and the result from 3 | |||
| 25444 | // But currently, we just fall back to scalar operations. | |||
| 25445 | if (IsStrict && IsAlwaysSignaling && !IsSignaling) | |||
| 25446 | return SDValue(); | |||
| 25447 | ||||
| 25448 | // Insert an extra signaling instruction to raise exception. | |||
| 25449 | if (IsStrict && !IsAlwaysSignaling && IsSignaling) { | |||
| 25450 | SDValue SignalCmp = DAG.getNode( | |||
| 25451 | Opc, dl, {VT, MVT::Other}, | |||
| 25452 | {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS | |||
| 25453 | // FIXME: It seems we need to update the flags of all new strict nodes. | |||
| 25454 | // Otherwise, mayRaiseFPException in MI will return false because | |||
| 25455 | // NoFPExcept defaults to false. However, I didn't find this handled in | |||
| 25456 | // other patches. | |||
| 25457 | SignalCmp->setFlags(Op->getFlags()); | |||
| 25458 | Chain = SignalCmp.getValue(1); | |||
| 25459 | } | |||
| 25460 | ||||
| 25461 | // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE), | |||
| 25462 | // emit two comparisons and a logic op to tie them together. | |||
| 25463 | if (!cheapX86FSETCC_SSE(Cond)) { | |||
| 25464 | // LLVM predicate is SETUEQ or SETONE. | |||
| 25465 | unsigned CC0, CC1; | |||
| 25466 | unsigned CombineOpc; | |||
| 25467 | if (Cond == ISD::SETUEQ) { | |||
| 25468 | CC0 = 3; // UNORD | |||
| 25469 | CC1 = 0; // EQ | |||
| 25470 | CombineOpc = X86ISD::FOR; | |||
| 25471 | } else { | |||
| 25472 | assert(Cond == ISD::SETONE); | |||
| 25473 | CC0 = 7; // ORD | |||
| 25474 | CC1 = 4; // NEQ | |||
| 25475 | CombineOpc = X86ISD::FAND; | |||
| 25476 | } | |||
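| | // Note: on a v4f32 SETUEQ this roughly selects CMPUNORDPS and CMPEQPS | |||
| | // combined with ORPS (unordered-or-equal), while SETONE selects CMPORDPS | |||
| | // and CMPNEQPS combined with ANDPS; the immediates above are the SSE | |||
| | // compare-predicate encodings (0 = EQ, 3 = UNORD, 4 = NEQ, 7 = ORD). | |||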
| 25477 | ||||
| 25478 | SDValue Cmp0, Cmp1; | |||
| 25479 | if (IsStrict) { | |||
| 25480 | Cmp0 = DAG.getNode( | |||
| 25481 | Opc, dl, {VT, MVT::Other}, | |||
| 25482 | {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)}); | |||
| 25483 | Cmp1 = DAG.getNode( | |||
| 25484 | Opc, dl, {VT, MVT::Other}, | |||
| 25485 | {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)}); | |||
| 25486 | Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1), | |||
| 25487 | Cmp1.getValue(1)); | |||
| 25488 | } else { | |||
| 25489 | Cmp0 = DAG.getNode( | |||
| 25490 | Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)); | |||
| 25491 | Cmp1 = DAG.getNode( | |||
| 25492 | Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)); | |||
| 25493 | } | |||
| 25494 | Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1); | |||
| 25495 | } else { | |||
| 25496 | if (IsStrict) { | |||
| 25497 | Cmp = DAG.getNode( | |||
| 25498 | Opc, dl, {VT, MVT::Other}, | |||
| 25499 | {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)}); | |||
| 25500 | Chain = Cmp.getValue(1); | |||
| 25501 | } else | |||
| 25502 | Cmp = DAG.getNode( | |||
| 25503 | Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)); | |||
| 25504 | } | |||
| 25505 | } else { | |||
| 25506 | // Handle all other FP comparisons here. | |||
| 25507 | if (IsStrict) { | |||
| 25508 | // Flip already-signaling CCs before setting bit 4 of the AVX CC. | |||
| 25509 | SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4; | |||
| 25510 | Cmp = DAG.getNode( | |||
| 25511 | Opc, dl, {VT, MVT::Other}, | |||
| 25512 | {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)}); | |||
| 25513 | Chain = Cmp.getValue(1); | |||
| 25514 | } else | |||
| 25515 | Cmp = DAG.getNode( | |||
| 25516 | Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)); | |||
| 25517 | } | |||
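| | // Note: AVX widens the compare-predicate immediate to 5 bits, and bit 4 | |||
| | // flips the quiet/signaling behavior of the base predicate (e.g. 0x00 | |||
| | // EQ_OQ vs. 0x10 EQ_OS), which is why the strict path above toggles bit 4 | |||
| | // with (IsAlwaysSignaling ^ IsSignaling). | |||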
| 25518 | ||||
| 25519 | if (VT.getFixedSizeInBits() > | |||
| 25520 | Op.getSimpleValueType().getFixedSizeInBits()) { | |||
| 25521 | // We emitted a compare with an XMM/YMM result. Finish converting to a | |||
| 25522 | // mask register using a vptestm. | |||
| 25523 | EVT CastVT = EVT(VT).changeVectorElementTypeToInteger(); | |||
| 25524 | Cmp = DAG.getBitcast(CastVT, Cmp); | |||
| 25525 | Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp, | |||
| 25526 | DAG.getConstant(0, dl, CastVT), ISD::SETNE); | |||
| 25527 | } else { | |||
| 25528 | // If this is SSE/AVX CMPP, bitcast the result back to integer to match | |||
| 25529 | // the result type of SETCC. The bitcast is expected to be optimized | |||
| 25530 | // away during combining/isel. | |||
| 25531 | Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp); | |||
| 25532 | } | |||
| 25533 | ||||
| 25534 | if (IsStrict) | |||
| 25535 | return DAG.getMergeValues({Cmp, Chain}, dl); | |||
| 25536 | ||||
| 25537 | return Cmp; | |||
| 25538 | } | |||
| 25539 | ||||
| 25540 | assert(!IsStrict && "Strict SETCC only handles FP operands."); | |||
| 25541 | ||||
| 25542 | MVT VTOp0 = Op0.getSimpleValueType(); | |||
| 25543 | (void)VTOp0; | |||
| 25544 | assert(VTOp0 == Op1.getSimpleValueType() && | |||
| 25545 | "Expected operands with same type!"); | |||
| 25546 | assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() && | |||
| 25547 | "Invalid number of packed elements for source and destination!"); | |||
| 25548 | ||||
| 25549 | // The non-AVX512 code below works under the assumption that source and | |||
| 25550 | // destination types are the same. | |||
| 25551 | assert((Subtarget.hasAVX512() || (VT == VTOp0)) && | |||
| 25552 | "Value types for source and destination must be the same!"); | |||
| 25553 | ||||
| 25554 | // The result is boolean, but operands are int/float | |||
| 25555 | if (VT.getVectorElementType() == MVT::i1) { | |||
| 25556 | // In the AVX-512 architecture, setcc returns a mask with i1 elements, | |||
| 25557 | // but there is no compare instruction for i8 and i16 elements in KNL. | |||
| 25558 | assert((VTOp0.getScalarSizeInBits() >= 32 || Subtarget.hasBWI()) && | |||
| 25559 | "Unexpected operand type"); | |||
| 25560 | return LowerIntVSETCC_AVX512(Op, DAG); | |||
| 25561 | } | |||
| 25562 | ||||
| 25563 | // Lower using XOP integer comparisons. | |||
| 25564 | if (VT.is128BitVector() && Subtarget.hasXOP()) { | |||
| 25565 | // Translate compare code to XOP PCOM compare mode. | |||
| 25566 | unsigned CmpMode = 0; | |||
| 25567 | switch (Cond) { | |||
| 25568 | default: llvm_unreachable("Unexpected SETCC condition")::llvm::llvm_unreachable_internal("Unexpected SETCC condition" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 25568); | |||
| 25569 | case ISD::SETULT: | |||
| 25570 | case ISD::SETLT: CmpMode = 0x00; break; | |||
| 25571 | case ISD::SETULE: | |||
| 25572 | case ISD::SETLE: CmpMode = 0x01; break; | |||
| 25573 | case ISD::SETUGT: | |||
| 25574 | case ISD::SETGT: CmpMode = 0x02; break; | |||
| 25575 | case ISD::SETUGE: | |||
| 25576 | case ISD::SETGE: CmpMode = 0x03; break; | |||
| 25577 | case ISD::SETEQ: CmpMode = 0x04; break; | |||
| 25578 | case ISD::SETNE: CmpMode = 0x05; break; | |||
| 25579 | } | |||
| 25580 | ||||
| 25581 | // Are we comparing unsigned or signed integers? | |||
| 25582 | unsigned Opc = | |||
| 25583 | ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM; | |||
| 25584 | ||||
| 25585 | return DAG.getNode(Opc, dl, VT, Op0, Op1, | |||
| 25586 | DAG.getTargetConstant(CmpMode, dl, MVT::i8)); | |||
| 25587 | } | |||
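| | // For example, a v4i32 SETULT here becomes a single VPCOMU with mode 0x00, | |||
| | // where SSE2 alone would need a sign-bit flip of both operands plus a | |||
| | // swapped PCMPGTD, as in the generic integer path below. | |||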
| 25588 | ||||
| 25589 | // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2. | |||
| 25590 | // Revert part of the simplifySetCCWithAnd combine, to avoid an invert. | |||
| 25591 | if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) { | |||
| 25592 | SDValue BC0 = peekThroughBitcasts(Op0); | |||
| 25593 | if (BC0.getOpcode() == ISD::AND) { | |||
| 25594 | APInt UndefElts; | |||
| 25595 | SmallVector<APInt, 64> EltBits; | |||
| 25596 | if (getTargetConstantBitsFromNode(BC0.getOperand(1), | |||
| 25597 | VT.getScalarSizeInBits(), UndefElts, | |||
| 25598 | EltBits, false, false)) { | |||
| 25599 | if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) { | |||
| 25600 | Cond = ISD::SETEQ; | |||
| 25601 | Op1 = DAG.getBitcast(VT, BC0.getOperand(1)); | |||
| 25602 | } | |||
| 25603 | } | |||
| 25604 | } | |||
| 25605 | } | |||
| 25606 | ||||
| 25607 | // ICMP_EQ(AND(X,C),C) -> SRA(SHL(X,LOG2(C)),BW-1) iff C is power-of-2. | |||
| 25608 | if (Cond == ISD::SETEQ && Op0.getOpcode() == ISD::AND && | |||
| 25609 | Op0.getOperand(1) == Op1 && Op0.hasOneUse()) { | |||
| 25610 | ConstantSDNode *C1 = isConstOrConstSplat(Op1); | |||
| 25611 | if (C1 && C1->getAPIntValue().isPowerOf2()) { | |||
| 25612 | unsigned BitWidth = VT.getScalarSizeInBits(); | |||
| 25613 | unsigned ShiftAmt = BitWidth - C1->getAPIntValue().logBase2() - 1; | |||
| 25614 | ||||
| 25615 | SDValue Result = Op0.getOperand(0); | |||
| 25616 | Result = DAG.getNode(ISD::SHL, dl, VT, Result, | |||
| 25617 | DAG.getConstant(ShiftAmt, dl, VT)); | |||
| 25618 | Result = DAG.getNode(ISD::SRA, dl, VT, Result, | |||
| 25619 | DAG.getConstant(BitWidth - 1, dl, VT)); | |||
| 25620 | return Result; | |||
| 25621 | } | |||
| 25622 | } | |||
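| | // Worked example, assuming v16i8 and C == 0x10: BitWidth is 8 and | |||
| | // logBase2(C) is 4, so ShiftAmt is 3; SHL by 3 moves bit 4 into the sign | |||
| | // bit, and SRA by 7 then broadcasts it, giving all-ones exactly when | |||
| | // (X & 0x10) == 0x10. | |||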
| 25623 | ||||
| 25624 | // Break 256-bit integer vector compare into smaller ones. | |||
| 25625 | if (VT.is256BitVector() && !Subtarget.hasInt256()) | |||
| 25626 | return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl); | |||
| 25627 | ||||
| 25628 | // Break 512-bit integer vector compare into smaller ones. | |||
| 25629 | // TODO: Try harder to use VPCMPx + VPMOV2x? | |||
| 25630 | if (VT.is512BitVector()) | |||
| 25631 | return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl); | |||
| 25632 | ||||
| 25633 | // If we have a limit constant, try to form PCMPGT (signed cmp) to avoid | |||
| 25634 | // not-of-PCMPEQ: | |||
| 25635 | // X != INT_MIN --> X >s INT_MIN | |||
| 25636 | // X != INT_MAX --> X <s INT_MAX --> INT_MAX >s X | |||
| 25637 | // +X != 0 --> +X >s 0 | |||
| 25638 | APInt ConstValue; | |||
| 25639 | if (Cond == ISD::SETNE && | |||
| 25640 | ISD::isConstantSplatVector(Op1.getNode(), ConstValue)) { | |||
| 25641 | if (ConstValue.isMinSignedValue()) | |||
| 25642 | Cond = ISD::SETGT; | |||
| 25643 | else if (ConstValue.isMaxSignedValue()) | |||
| 25644 | Cond = ISD::SETLT; | |||
| 25645 | else if (ConstValue.isZero() && DAG.SignBitIsZero(Op0)) | |||
| 25646 | Cond = ISD::SETGT; | |||
| 25647 | } | |||
| 25648 | ||||
| 25649 | // If both operands are known non-negative, then an unsigned compare is the | |||
| 25650 | // same as a signed compare and there's no need to flip signbits. | |||
| 25651 | // TODO: We could check for more general simplifications here since we're | |||
| 25652 | // computing known bits. | |||
| 25653 | bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) && | |||
| 25654 | !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1)); | |||
| 25655 | ||||
| 25656 | // Special case: Use min/max operations for unsigned compares. | |||
| 25657 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 25658 | if (ISD::isUnsignedIntSetCC(Cond) && | |||
| 25659 | (FlipSigns || ISD::isTrueWhenEqual(Cond)) && | |||
| 25660 | TLI.isOperationLegal(ISD::UMIN, VT)) { | |||
| 25661 | // If we have a constant operand, increment/decrement it and change the | |||
| 25662 | // condition to avoid an invert. | |||
| 25663 | if (Cond == ISD::SETUGT) { | |||
| 25664 | // X > C --> X >= (C+1) --> X == umax(X, C+1) | |||
| 25665 | if (SDValue UGTOp1 = | |||
| 25666 | incDecVectorConstant(Op1, DAG, /*IsInc*/ true, /*NSW*/ false)) { | |||
| 25667 | Op1 = UGTOp1; | |||
| 25668 | Cond = ISD::SETUGE; | |||
| 25669 | } | |||
| 25670 | } | |||
| 25671 | if (Cond == ISD::SETULT) { | |||
| 25672 | // X < C --> X <= (C-1) --> X == umin(X, C-1) | |||
| 25673 | if (SDValue ULTOp1 = | |||
| 25674 | incDecVectorConstant(Op1, DAG, /*IsInc*/ false, /*NSW*/ false)) { | |||
| 25675 | Op1 = ULTOp1; | |||
| 25676 | Cond = ISD::SETULE; | |||
| 25677 | } | |||
| 25678 | } | |||
| 25679 | bool Invert = false; | |||
| 25680 | unsigned Opc; | |||
| 25681 | switch (Cond) { | |||
| 25682 | default: llvm_unreachable("Unexpected condition code")::llvm::llvm_unreachable_internal("Unexpected condition code" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 25682); | |||
| 25683 | case ISD::SETUGT: Invert = true; [[fallthrough]]; | |||
| 25684 | case ISD::SETULE: Opc = ISD::UMIN; break; | |||
| 25685 | case ISD::SETULT: Invert = true; [[fallthrough]]; | |||
| 25686 | case ISD::SETUGE: Opc = ISD::UMAX; break; | |||
| 25687 | } | |||
| 25688 | ||||
| 25689 | SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); | |||
| 25690 | Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result); | |||
| 25691 | ||||
| 25692 | // If the logical-not of the result is required, perform that now. | |||
| 25693 | if (Invert) | |||
| 25694 | Result = DAG.getNOT(dl, Result, VT); | |||
| 25695 | ||||
| 25696 | return Result; | |||
| 25697 | } | |||
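| | // For example, a v16i8 'X <u 16' is decremented to 'X <=u 15' above and | |||
| | // then lowered as PCMPEQB(X, PMINUB(X, 15)): umin(X, 15) equals X exactly | |||
| | // when X <=u 15, instead of a sign-flip + PCMPGTB sequence. | |||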
| 25698 | ||||
| 25699 | // Try to use SUBUS and PCMPEQ. | |||
| 25700 | if (FlipSigns) | |||
| 25701 | if (SDValue V = | |||
| 25702 | LowerVSETCCWithSUBUS(Op0, Op1, VT, Cond, dl, Subtarget, DAG)) | |||
| 25703 | return V; | |||
| 25704 | ||||
| 25705 | // We are handling one of the integer comparisons here. Since SSE only has | |||
| 25706 | // GT and EQ comparisons for integer, swapping operands and multiple | |||
| 25707 | // operations may be required for some comparisons. | |||
| 25708 | unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ | |||
| 25709 | : X86ISD::PCMPGT; | |||
| 25710 | bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT || | |||
| 25711 | Cond == ISD::SETGE || Cond == ISD::SETUGE; | |||
| 25712 | bool Invert = Cond == ISD::SETNE || | |||
| 25713 | (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond)); | |||
| 25714 | ||||
| 25715 | if (Swap) | |||
| 25716 | std::swap(Op0, Op1); | |||
| 25717 | ||||
| 25718 | // Check that the operation in question is available (most are plain SSE2, | |||
| 25719 | // but PCMPGTQ and PCMPEQQ have different requirements). | |||
| 25720 | if (VT == MVT::v2i64) { | |||
| 25721 | if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) { | |||
| 25722 | assert(Subtarget.hasSSE2() && "Don't know how to lower!"); | |||
| 25723 | ||||
| 25724 | // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle | |||
| 25725 | // the odd elements over the even elements. | |||
| 25726 | if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) { | |||
| 25727 | Op0 = DAG.getConstant(0, dl, MVT::v4i32); | |||
| 25728 | Op1 = DAG.getBitcast(MVT::v4i32, Op1); | |||
| 25729 | ||||
| 25730 | SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); | |||
| 25731 | static const int MaskHi[] = { 1, 1, 3, 3 }; | |||
| 25732 | SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); | |||
| 25733 | ||||
| 25734 | return DAG.getBitcast(VT, Result); | |||
| 25735 | } | |||
| 25736 | ||||
| 25737 | if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) { | |||
| 25738 | Op0 = DAG.getBitcast(MVT::v4i32, Op0); | |||
| 25739 | Op1 = DAG.getConstant(-1, dl, MVT::v4i32); | |||
| 25740 | ||||
| 25741 | SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); | |||
| 25742 | static const int MaskHi[] = { 1, 1, 3, 3 }; | |||
| 25743 | SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); | |||
| 25744 | ||||
| 25745 | return DAG.getBitcast(VT, Result); | |||
| 25746 | } | |||
| 25747 | ||||
| 25748 | // Since SSE has no unsigned integer comparisons, we need to flip the sign | |||
| 25749 | // bits of the inputs before performing those operations. The lower | |||
| 25750 | // compare is always unsigned. | |||
| 25751 | SDValue SB = DAG.getConstant(FlipSigns ? 0x8000000080000000ULL | |||
| 25752 | : 0x0000000080000000ULL, | |||
| 25753 | dl, MVT::v2i64); | |||
| 25754 | ||||
| 25755 | Op0 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op0, SB); | |||
| 25756 | Op1 = DAG.getNode(ISD::XOR, dl, MVT::v2i64, Op1, SB); | |||
| 25757 | ||||
| 25758 | // Cast everything to the right type. | |||
| 25759 | Op0 = DAG.getBitcast(MVT::v4i32, Op0); | |||
| 25760 | Op1 = DAG.getBitcast(MVT::v4i32, Op1); | |||
| 25761 | ||||
| 25762 | // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2)) | |||
| 25763 | SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1); | |||
| 25764 | SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1); | |||
| 25765 | ||||
| 25766 | // Create masks for only the low parts/high parts of the 64-bit integers. | |||
| 25767 | static const int MaskHi[] = { 1, 1, 3, 3 }; | |||
| 25768 | static const int MaskLo[] = { 0, 0, 2, 2 }; | |||
| 25769 | SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi); | |||
| 25770 | SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo); | |||
| 25771 | SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi); | |||
| 25772 | ||||
| 25773 | SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo); | |||
| 25774 | Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi); | |||
| 25775 | ||||
| 25776 | if (Invert) | |||
| 25777 | Result = DAG.getNOT(dl, Result, MVT::v4i32); | |||
| 25778 | ||||
| 25779 | return DAG.getBitcast(VT, Result); | |||
| 25780 | } | |||
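| | // Note: the 0x0000000080000000 constant above flips only the sign bit of | |||
| | // each low 32-bit half, which is what makes the low-part PCMPGTD act as an | |||
| | // unsigned compare; when FlipSigns is set the high halves are flipped as | |||
| | // well so the whole emulated 64-bit compare becomes unsigned. | |||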
| 25781 | ||||
| 25782 | if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) { | |||
| 25783 | // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with | |||
| 25784 | // pcmpeqd + pshufd + pand. | |||
| 25785 | assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!"); | |||
| 25786 | ||||
| 25787 | // First cast everything to the right type. | |||
| 25788 | Op0 = DAG.getBitcast(MVT::v4i32, Op0); | |||
| 25789 | Op1 = DAG.getBitcast(MVT::v4i32, Op1); | |||
| 25790 | ||||
| 25791 | // Do the compare. | |||
| 25792 | SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1); | |||
| 25793 | ||||
| 25794 | // Make sure the lower and upper halves are both all-ones. | |||
| 25795 | static const int Mask[] = { 1, 0, 3, 2 }; | |||
| 25796 | SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask); | |||
| 25797 | Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf); | |||
| 25798 | ||||
| 25799 | if (Invert) | |||
| 25800 | Result = DAG.getNOT(dl, Result, MVT::v4i32); | |||
| 25801 | ||||
| 25802 | return DAG.getBitcast(VT, Result); | |||
| 25803 | } | |||
| 25804 | } | |||
| 25805 | ||||
| 25806 | // Since SSE has no unsigned integer comparisons, we need to flip the sign | |||
| 25807 | // bits of the inputs before performing those operations. | |||
| 25808 | if (FlipSigns) { | |||
| 25809 | MVT EltVT = VT.getVectorElementType(); | |||
| 25810 | SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl, | |||
| 25811 | VT); | |||
| 25812 | Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM); | |||
| 25813 | Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM); | |||
| 25814 | } | |||
| 25815 | ||||
| 25816 | SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1); | |||
| 25817 | ||||
| 25818 | // If the logical-not of the result is required, perform that now. | |||
| 25819 | if (Invert) | |||
| 25820 | Result = DAG.getNOT(dl, Result, VT); | |||
| 25821 | ||||
| 25822 | return Result; | |||
| 25823 | } | |||
| 25824 | ||||
| 25825 | // Try to select this as a KORTEST+SETCC or KTEST+SETCC if possible. | |||
| 25826 | static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC, | |||
| 25827 | const SDLoc &dl, SelectionDAG &DAG, | |||
| 25828 | const X86Subtarget &Subtarget, | |||
| 25829 | SDValue &X86CC) { | |||
| 25830 | assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode"); | |||
| 25831 | ||||
| 25832 | // Must be a bitcast from vXi1. | |||
| 25833 | if (Op0.getOpcode() != ISD::BITCAST) | |||
| 25834 | return SDValue(); | |||
| 25835 | ||||
| 25836 | Op0 = Op0.getOperand(0); | |||
| 25837 | MVT VT = Op0.getSimpleValueType(); | |||
| 25838 | if (!(Subtarget.hasAVX512() && VT == MVT::v16i1) && | |||
| 25839 | !(Subtarget.hasDQI() && VT == MVT::v8i1) && | |||
| 25840 | !(Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1))) | |||
| 25841 | return SDValue(); | |||
| 25842 | ||||
| 25843 | X86::CondCode X86Cond; | |||
| 25844 | if (isNullConstant(Op1)) { | |||
| 25845 | X86Cond = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE; | |||
| 25846 | } else if (isAllOnesConstant(Op1)) { | |||
| 25847 | // C flag is set for all ones. | |||
| 25848 | X86Cond = CC == ISD::SETEQ ? X86::COND_B : X86::COND_AE; | |||
| 25849 | } else | |||
| 25850 | return SDValue(); | |||
| 25851 | ||||
| 25852 | // If the input is an AND, we can combine its operands into the KTEST. | |||
| 25853 | bool KTestable = false; | |||
| 25854 | if (Subtarget.hasDQI() && (VT == MVT::v8i1 || VT == MVT::v16i1)) | |||
| 25855 | KTestable = true; | |||
| 25856 | if (Subtarget.hasBWI() && (VT == MVT::v32i1 || VT == MVT::v64i1)) | |||
| 25857 | KTestable = true; | |||
| 25858 | if (!isNullConstant(Op1)) | |||
| 25859 | KTestable = false; | |||
| 25860 | if (KTestable && Op0.getOpcode() == ISD::AND && Op0.hasOneUse()) { | |||
| 25861 | SDValue LHS = Op0.getOperand(0); | |||
| 25862 | SDValue RHS = Op0.getOperand(1); | |||
| 25863 | X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); | |||
| 25864 | return DAG.getNode(X86ISD::KTEST, dl, MVT::i32, LHS, RHS); | |||
| 25865 | } | |||
| 25866 | ||||
| 25867 | // If the input is an OR, we can combine its operands into the KORTEST. | |||
| 25868 | SDValue LHS = Op0; | |||
| 25869 | SDValue RHS = Op0; | |||
| 25870 | if (Op0.getOpcode() == ISD::OR && Op0.hasOneUse()) { | |||
| 25871 | LHS = Op0.getOperand(0); | |||
| 25872 | RHS = Op0.getOperand(1); | |||
| 25873 | } | |||
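| | // KORTEST sets ZF when the OR of the two masks is zero and CF when it is | |||
| | // all ones, so both the ==0 and ==-1 checks above map onto one instruction; | |||
| | // KTEST only helps the ==0 case, which is why KTestable requires a null Op1. | |||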
| 25874 | ||||
| 25875 | X86CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8); | |||
| 25876 | return DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS); | |||
| 25877 | } | |||
| 25878 | ||||
| 25879 | /// Emit flags for the given setcc condition and operands. Also returns the | |||
| 25880 | /// corresponding X86 condition code constant in X86CC. | |||
| 25881 | SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1, | |||
| 25882 | ISD::CondCode CC, const SDLoc &dl, | |||
| 25883 | SelectionDAG &DAG, | |||
| 25884 | SDValue &X86CC) const { | |||
| 25885 | // Equality Combines. | |||
| 25886 | if (CC == ISD::SETEQ || CC == ISD::SETNE) { | |||
| 25887 | X86::CondCode X86CondCode; | |||
| 25888 | ||||
| 25889 | // Optimize to BT if possible. | |||
| 25890 | // Lower (X & (1 << N)) == 0 to BT(X, N). | |||
| 25891 | // Lower ((X >>u N) & 1) != 0 to BT(X, N). | |||
| 25892 | // Lower ((X >>s N) & 1) != 0 to BT(X, N). | |||
| 25893 | if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1)) { | |||
| 25894 | if (SDValue BT = LowerAndToBT(Op0, CC, dl, DAG, X86CondCode)) { | |||
| 25895 | X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8); | |||
| 25896 | return BT; | |||
| 25897 | } | |||
| 25898 | } | |||
| 25899 | ||||
| 25900 | // Try to use PTEST/PMOVMSKB for a tree of ANDs/ORs equality-compared with -1/0. | |||
| 25901 | if (SDValue CmpZ = MatchVectorAllEqualTest(Op0, Op1, CC, dl, Subtarget, DAG, | |||
| 25902 | X86CondCode)) { | |||
| 25903 | X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8); | |||
| 25904 | return CmpZ; | |||
| 25905 | } | |||
| 25906 | ||||
| 25907 | // Try to lower using KORTEST or KTEST. | |||
| 25908 | if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC)) | |||
| 25909 | return Test; | |||
| 25910 | ||||
| 25911 | // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms | |||
| 25912 | // of these. | |||
| 25913 | if (isOneConstant(Op1) || isNullConstant(Op1)) { | |||
| 25914 | // If the input is a setcc, then reuse the input setcc or use a new one | |||
| 25915 | // with the inverted condition. | |||
| 25916 | if (Op0.getOpcode() == X86ISD::SETCC) { | |||
| 25917 | bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1); | |||
| 25918 | ||||
| 25919 | X86CC = Op0.getOperand(0); | |||
| 25920 | if (Invert) { | |||
| 25921 | X86CondCode = (X86::CondCode)Op0.getConstantOperandVal(0); | |||
| 25922 | X86CondCode = X86::GetOppositeBranchCondition(X86CondCode); | |||
| 25923 | X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8); | |||
| 25924 | } | |||
| 25925 | ||||
| 25926 | return Op0.getOperand(1); | |||
| 25927 | } | |||
| 25928 | } | |||
| 25929 | ||||
| 25930 | // Try to use the carry flag from the add in place of a separate CMP for: | |||
| 25931 | // (seteq (add X, -1), -1). Similar for setne. | |||
| 25932 | if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD && | |||
| 25933 | Op0.getOperand(1) == Op1) { | |||
| 25934 | if (isProfitableToUseFlagOp(Op0)) { | |||
| 25935 | SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); | |||
| 25936 | ||||
| 25937 | SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0), | |||
| 25938 | Op0.getOperand(1)); | |||
| 25939 | DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New); | |||
| 25940 | X86CondCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B; | |||
| 25941 | X86CC = DAG.getTargetConstant(X86CondCode, dl, MVT::i8); | |||
| 25942 | return SDValue(New.getNode(), 1); | |||
| 25943 | } | |||
| 25944 | } | |||
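| | // Worked example: for i32, (add X, -1) sets CF exactly when X != 0, so | |||
| | // (seteq (add X, -1), -1) becomes a COND_AE (CF == 0) test on the ADD's | |||
| | // own flags and the separate CMP against -1 disappears. | |||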
| 25945 | } | |||
| 25946 | ||||
| 25947 | X86::CondCode CondCode = | |||
| 25948 | TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG); | |||
| 25949 | assert(CondCode != X86::COND_INVALID && "Unexpected condition code!"); | |||
| 25950 | ||||
| 25951 | SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget); | |||
| 25952 | X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8); | |||
| 25953 | return EFLAGS; | |||
| 25954 | } | |||
| 25955 | ||||
| 25956 | SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { | |||
| 25957 | ||||
| 25958 | bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC || | |||
| 25959 | Op.getOpcode() == ISD::STRICT_FSETCCS; | |||
| 25960 | MVT VT = Op->getSimpleValueType(0); | |||
| 25961 | ||||
| 25962 | if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG); | |||
| 25963 | ||||
| 25964 | assert(VT == MVT::i8 && "SetCC type must be 8-bit integer"); | |||
| 25965 | SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue(); | |||
| 25966 | SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0); | |||
| 25967 | SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1); | |||
| 25968 | SDLoc dl(Op); | |||
| 25969 | ISD::CondCode CC = | |||
| 25970 | cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get(); | |||
| 25971 | ||||
| 25972 | if (isSoftFP16(Op0.getValueType())) | |||
| 25973 | return SDValue(); | |||
| 25974 | ||||
| 25975 | // Handle f128 first, since one possible outcome is a normal integer | |||
| 25976 | // comparison which gets handled by emitFlagsForSetcc. | |||
| 25977 | if (Op0.getValueType() == MVT::f128) { | |||
| 25978 | softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain, | |||
| 25979 | Op.getOpcode() == ISD::STRICT_FSETCCS); | |||
| 25980 | ||||
| 25981 | // If softenSetCCOperands returned a scalar, use it. | |||
| 25982 | if (!Op1.getNode()) { | |||
| 25983 | assert(Op0.getValueType() == Op.getValueType() && | |||
| 25984 | "Unexpected setcc expansion!"); | |||
| 25985 | if (IsStrict) | |||
| 25986 | return DAG.getMergeValues({Op0, Chain}, dl); | |||
| 25987 | return Op0; | |||
| 25988 | } | |||
| 25989 | } | |||
| 25990 | ||||
| 25991 | if (Op0.getSimpleValueType().isInteger()) { | |||
| 25992 | // Attempt to canonicalize SGT/UGT -> SGE/UGE compares with a constant, which | |||
| 25993 | // reduces the number of EFLAGS bit reads (the GE conditions don't read ZF); | |||
| 25994 | // this may translate to fewer uops depending on the uarch implementation. The | |||
| 25995 | // equivalent for SLE/ULE -> SLT/ULT isn't likely to happen as we already | |||
| 25996 | // canonicalize to that CondCode. | |||
| 25997 | // NOTE: Only do this if incrementing the constant doesn't increase the bit | |||
| 25998 | // encoding size - so it must either already be an i8 or i32 immediate, or it | |||
| 25999 | // shrinks down to that. We don't do this for any i64's to avoid additional | |||
| 26000 | // constant materializations. | |||
| 26001 | // TODO: Can we move this to TranslateX86CC to handle jumps/branches too? | |||
| 26002 | if (auto *Op1C = dyn_cast<ConstantSDNode>(Op1)) { | |||
| 26003 | const APInt &Op1Val = Op1C->getAPIntValue(); | |||
| 26004 | if (!Op1Val.isZero()) { | |||
| 26005 | // Ensure the constant+1 doesn't overflow. | |||
| 26006 | if ((CC == ISD::CondCode::SETGT && !Op1Val.isMaxSignedValue()) || | |||
| 26007 | (CC == ISD::CondCode::SETUGT && !Op1Val.isMaxValue())) { | |||
| 26008 | APInt Op1ValPlusOne = Op1Val + 1; | |||
| 26009 | if (Op1ValPlusOne.isSignedIntN(32) && | |||
| 26010 | (!Op1Val.isSignedIntN(8) || Op1ValPlusOne.isSignedIntN(8))) { | |||
| 26011 | Op1 = DAG.getConstant(Op1ValPlusOne, dl, Op0.getValueType()); | |||
| 26012 | CC = CC == ISD::CondCode::SETGT ? ISD::CondCode::SETGE | |||
| 26013 | : ISD::CondCode::SETUGE; | |||
| 26014 | } | |||
| 26015 | } | |||
| 26016 | } | |||
| 26017 | } | |||
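| | // For example, 'x >s 7' is rewritten as 'x >=s 8' here: COND_GE only reads | |||
| | // SF and OF, while COND_G also reads ZF, and 8 still encodes as an i8 | |||
| | // immediate so the rewrite is free. | |||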
| 26018 | ||||
| 26019 | SDValue X86CC; | |||
| 26020 | SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC); | |||
| 26021 | SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS); | |||
| 26022 | return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; | |||
| 26023 | } | |||
| 26024 | ||||
| 26025 | // Handle floating point. | |||
| 26026 | X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG); | |||
| 26027 | if (CondCode == X86::COND_INVALID) | |||
| 26028 | return SDValue(); | |||
| 26029 | ||||
| 26030 | SDValue EFLAGS; | |||
| 26031 | if (IsStrict) { | |||
| 26032 | bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS; | |||
| 26033 | EFLAGS = | |||
| 26034 | DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP, | |||
| 26035 | dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1}); | |||
| 26036 | Chain = EFLAGS.getValue(1); | |||
| 26037 | } else { | |||
| 26038 | EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1); | |||
| 26039 | } | |||
| 26040 | ||||
| 26041 | SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8); | |||
| 26042 | SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS); | |||
| 26043 | return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res; | |||
| 26044 | } | |||
| 26045 | ||||
| 26046 | SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { | |||
| 26047 | SDValue LHS = Op.getOperand(0); | |||
| 26048 | SDValue RHS = Op.getOperand(1); | |||
| 26049 | SDValue Carry = Op.getOperand(2); | |||
| 26050 | SDValue Cond = Op.getOperand(3); | |||
| 26051 | SDLoc DL(Op); | |||
| 26052 | ||||
| 26053 | assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only."); | |||
| 26054 | X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get()); | |||
| 26055 | ||||
| 26056 | // Recreate the carry if needed. | |||
| 26057 | EVT CarryVT = Carry.getValueType(); | |||
| 26058 | Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), | |||
| 26059 | Carry, DAG.getAllOnesConstant(DL, CarryVT)); | |||
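| | // Adding all-ones sets CF exactly when the materialized carry value is | |||
| | // nonzero, moving it back into EFLAGS for the SBB below. | |||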
| 26060 | ||||
| 26061 | SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); | |||
| 26062 | SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1)); | |||
| 26063 | return getSETCC(CC, Cmp.getValue(1), DL, DAG); | |||
| 26064 | } | |||
| 26065 | ||||
| 26066 | // This function produces three things: the arithmetic computation itself | |||
| 26067 | // (Value), an EFLAGS result (Overflow), and a condition code (Cond, returned | |||
| 26068 | // through the reference parameter). The flag and the condition code define | |||
| 26069 | // the case in which the arithmetic computation overflows. | |||
| 26070 | static std::pair<SDValue, SDValue> | |||
| 26071 | getX86XALUOOp(X86::CondCode &Cond, SDValue Op, SelectionDAG &DAG) { | |||
| 26072 | assert(Op.getResNo() == 0 && "Unexpected result number!"); | |||
| 26073 | SDValue Value, Overflow; | |||
| 26074 | SDValue LHS = Op.getOperand(0); | |||
| 26075 | SDValue RHS = Op.getOperand(1); | |||
| 26076 | unsigned BaseOp = 0; | |||
| 26077 | SDLoc DL(Op); | |||
| 26078 | switch (Op.getOpcode()) { | |||
| 26079 | default: llvm_unreachable("Unknown ovf instruction!")::llvm::llvm_unreachable_internal("Unknown ovf instruction!", "llvm/lib/Target/X86/X86ISelLowering.cpp", 26079); | |||
| 26080 | case ISD::SADDO: | |||
| 26081 | BaseOp = X86ISD::ADD; | |||
| 26082 | Cond = X86::COND_O; | |||
| 26083 | break; | |||
| 26084 | case ISD::UADDO: | |||
| 26085 | BaseOp = X86ISD::ADD; | |||
| 26086 | Cond = isOneConstant(RHS) ? X86::COND_E : X86::COND_B; | |||
| 26087 | break; | |||
| 26088 | case ISD::SSUBO: | |||
| 26089 | BaseOp = X86ISD::SUB; | |||
| 26090 | Cond = X86::COND_O; | |||
| 26091 | break; | |||
| 26092 | case ISD::USUBO: | |||
| 26093 | BaseOp = X86ISD::SUB; | |||
| 26094 | Cond = X86::COND_B; | |||
| 26095 | break; | |||
| 26096 | case ISD::SMULO: | |||
| 26097 | BaseOp = X86ISD::SMUL; | |||
| 26098 | Cond = X86::COND_O; | |||
| 26099 | break; | |||
| 26100 | case ISD::UMULO: | |||
| 26101 | BaseOp = X86ISD::UMUL; | |||
| 26102 | Cond = X86::COND_O; | |||
| 26103 | break; | |||
| 26104 | } | |||
| 26105 | ||||
| 26106 | if (BaseOp) { | |||
| 26107 | // Also sets EFLAGS. | |||
| 26108 | SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32); | |||
| 26109 | Value = DAG.getNode(BaseOp, DL, VTs, LHS, RHS); | |||
| 26110 | Overflow = Value.getValue(1); | |||
| 26111 | } | |||
| 26112 | ||||
| 26113 | return std::make_pair(Value, Overflow); | |||
| 26114 | } | |||
| 26115 | ||||
| 26116 | static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { | |||
| 26117 | // Lower the "add/sub/mul with overflow" instruction into a regular ins plus | |||
| 26118 | // a "setcc" instruction that checks the overflow flag. The "brcond" lowering | |||
| 26119 | // looks for this combo and may remove the "setcc" instruction if the "setcc" | |||
| 26120 | // has only one use. | |||
| 26121 | SDLoc DL(Op); | |||
| 26122 | X86::CondCode Cond; | |||
| 26123 | SDValue Value, Overflow; | |||
| 26124 | std::tie(Value, Overflow) = getX86XALUOOp(Cond, Op, DAG); | |||
| 26125 | ||||
| 26126 | SDValue SetCC = getSETCC(Cond, Overflow, DL, DAG); | |||
| 26127 | assert(Op->getValueType(1) == MVT::i8 && "Unexpected VT!")(static_cast <bool> (Op->getValueType(1) == MVT::i8 && "Unexpected VT!") ? void (0) : __assert_fail ("Op->getValueType(1) == MVT::i8 && \"Unexpected VT!\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 26127, __extension__ __PRETTY_FUNCTION__)); | |||
| 26128 | return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Value, SetCC); | |||
| 26129 | } | |||
| 26130 | ||||
| 26131 | /// Return true if opcode is a X86 logical comparison. | |||
| 26132 | static bool isX86LogicalCmp(SDValue Op) { | |||
| 26133 | unsigned Opc = Op.getOpcode(); | |||
| 26134 | if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI || | |||
| 26135 | Opc == X86ISD::FCMP) | |||
| 26136 | return true; | |||
| 26137 | if (Op.getResNo() == 1 && | |||
| 26138 | (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC || | |||
| 26139 | Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL || | |||
| 26140 | Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND)) | |||
| 26141 | return true; | |||
| 26142 | ||||
| 26143 | return false; | |||
| 26144 | } | |||
| 26145 | ||||
| 26146 | static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) { | |||
| 26147 | if (V.getOpcode() != ISD::TRUNCATE) | |||
| 26148 | return false; | |||
| 26149 | ||||
| 26150 | SDValue VOp0 = V.getOperand(0); | |||
| 26151 | unsigned InBits = VOp0.getValueSizeInBits(); | |||
| 26152 | unsigned Bits = V.getValueSizeInBits(); | |||
| 26153 | return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits, InBits - Bits)); | |||
| 26154 | } | |||
| 26155 | ||||
| 26156 | SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { | |||
| 26157 | bool AddTest = true; | |||
| 26158 | SDValue Cond = Op.getOperand(0); | |||
| 26159 | SDValue Op1 = Op.getOperand(1); | |||
| 26160 | SDValue Op2 = Op.getOperand(2); | |||
| 26161 | SDLoc DL(Op); | |||
| 26162 | MVT VT = Op1.getSimpleValueType(); | |||
| 26163 | SDValue CC; | |||
| 26164 | ||||
| 26165 | if (isSoftFP16(VT)) { | |||
| 26166 | MVT NVT = VT.changeTypeToInteger(); | |||
| 26167 | return DAG.getBitcast(VT, DAG.getNode(ISD::SELECT, DL, NVT, Cond, | |||
| 26168 | DAG.getBitcast(NVT, Op1), | |||
| 26169 | DAG.getBitcast(NVT, Op2))); | |||
| 26170 | } | |||
| 26171 | ||||
| 26172 | // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops | |||
| 26173 | // are available, or into VBLENDV if AVX is available. | |||
| 26174 | // Otherwise FP cmovs get lowered into a less efficient branch sequence later. | |||
| 26175 | if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) && | |||
| 26176 | VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { | |||
| 26177 | SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); | |||
| 26178 | bool IsAlwaysSignaling; | |||
| 26179 | unsigned SSECC = | |||
| 26180 | translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(), | |||
| 26181 | CondOp0, CondOp1, IsAlwaysSignaling); | |||
| 26182 | ||||
| 26183 | if (Subtarget.hasAVX512()) { | |||
| 26184 | SDValue Cmp = | |||
| 26185 | DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, CondOp1, | |||
| 26186 | DAG.getTargetConstant(SSECC, DL, MVT::i8)); | |||
| 26187 | assert(!VT.isVector() && "Not a scalar type?"); | |||
| 26188 | return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); | |||
| 26189 | } | |||
| 26190 | ||||
| 26191 | if (SSECC < 8 || Subtarget.hasAVX()) { | |||
| 26192 | SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, | |||
| 26193 | DAG.getTargetConstant(SSECC, DL, MVT::i8)); | |||
| 26194 | ||||
| 26195 | // If we have AVX, we can use a variable vector select (VBLENDV) instead | |||
| 26196 | // of 3 logic instructions for size savings and potentially speed. | |||
| 26197 | // Unfortunately, there is no scalar form of VBLENDV. | |||
| 26198 | ||||
| 26199 | // If either operand is a +0.0 constant, don't try this. We can expect to | |||
| 26200 | // optimize away at least one of the logic instructions later in that | |||
| 26201 | // case, so that sequence would be faster than a variable blend. | |||
| 26202 | ||||
| 26203 | // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly | |||
| 26204 | // uses XMM0 as the selection register. That may need just as many | |||
| 26205 | // instructions as the AND/ANDN/OR sequence due to register moves, so | |||
| 26206 | // don't bother. | |||
| 26207 | if (Subtarget.hasAVX() && !isNullFPConstant(Op1) && | |||
| 26208 | !isNullFPConstant(Op2)) { | |||
| 26209 | // Convert to vectors, do a VSELECT, and convert back to scalar. | |||
| 26210 | // All of the conversions should be optimized away. | |||
| 26211 | MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64; | |||
| 26212 | SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1); | |||
| 26213 | SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2); | |||
| 26214 | SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp); | |||
| 26215 | ||||
| 26216 | MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64; | |||
| 26217 | VCmp = DAG.getBitcast(VCmpVT, VCmp); | |||
| 26218 | ||||
| 26219 | SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2); | |||
| 26220 | ||||
| 26221 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, | |||
| 26222 | VSel, DAG.getIntPtrConstant(0, DL)); | |||
| 26223 | } | |||
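| | // Fall back to the classic three-op select: since Cmp is all-ones or | |||
| | // all-zeros per lane, (Cmp & Op1) | (~Cmp & Op2) picks Op1 when the | |||
| | // compare is true and Op2 otherwise. | |||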
| 26224 | SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2); | |||
| 26225 | SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1); | |||
| 26226 | return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And); | |||
| 26227 | } | |||
| 26228 | } | |||
| 26229 | ||||
| 26230 | // AVX512 fallback is to lower selects of scalar floats to masked moves. | |||
| 26231 | if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) { | |||
| 26232 | SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond); | |||
| 26233 | return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); | |||
| 26234 | } | |||
| 26235 | ||||
| 26236 | if (Cond.getOpcode() == ISD::SETCC && | |||
| 26237 | !isSoftFP16(Cond.getOperand(0).getSimpleValueType())) { | |||
| 26238 | if (SDValue NewCond = LowerSETCC(Cond, DAG)) { | |||
| 26239 | Cond = NewCond; | |||
| 26240 | // If the condition was updated, it's possible that the operands of the | |||
| 26241 | // select were also updated (for example, EmitTest has a RAUW). Refresh | |||
| 26242 | // the local references to the select operands in case they got stale. | |||
| 26243 | Op1 = Op.getOperand(1); | |||
| 26244 | Op2 = Op.getOperand(2); | |||
| 26245 | } | |||
| 26246 | } | |||
| 26247 | ||||
| 26248 | // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y | |||
| 26249 | // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y | |||
| 26250 | // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y | |||
| 26251 | // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y | |||
| 26252 | // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y | |||
| 26253 | // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y | |||
| 26254 | // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x | |||
| 26255 | // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x | |||
| 26256 | if (Cond.getOpcode() == X86ISD::SETCC && | |||
| 26257 | Cond.getOperand(1).getOpcode() == X86ISD::CMP && | |||
| 26258 | isNullConstant(Cond.getOperand(1).getOperand(1))) { | |||
| 26259 | SDValue Cmp = Cond.getOperand(1); | |||
| 26260 | SDValue CmpOp0 = Cmp.getOperand(0); | |||
| 26261 | unsigned CondCode = Cond.getConstantOperandVal(0); | |||
| 26262 | ||||
| 26263 | // Special handling for __builtin_ffs(X) - 1 pattern which looks like | |||
| 26264 | // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special | |||
| 26265 | // handling to keep the CMP with 0. This should be removed by | |||
| 26266 | // optimizeCompareInst by using the flags from the BSR/TZCNT used for the | |||
| 26267 | // cttz_zero_undef. | |||
| 26268 | auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) { | |||
| 26269 | return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() && | |||
| 26270 | Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2)); | |||
| 26271 | }; | |||
| 26272 | if (Subtarget.canUseCMOV() && (VT == MVT::i32 || VT == MVT::i64) && | |||
| 26273 | ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) || | |||
| 26274 | (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) { | |||
| 26275 | // Keep Cmp. | |||
| 26276 | } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && | |||
| 26277 | (CondCode == X86::COND_E || CondCode == X86::COND_NE)) { | |||
| 26278 | SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2; | |||
| 26279 | SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32); | |||
| 26280 | ||||
| 26281 | // 'X - 1' sets the carry flag if X == 0. | |||
| 26282 | // '0 - X' sets the carry flag if X != 0. | |||
| 26283 | // Convert the carry flag to a -1/0 mask with sbb: | |||
| 26284 | // select (X != 0), -1, Y --> 0 - X; or (sbb), Y | |||
| 26285 | // select (X == 0), Y, -1 --> 0 - X; or (sbb), Y | |||
| 26286 | // select (X != 0), Y, -1 --> X - 1; or (sbb), Y | |||
| 26287 | // select (X == 0), -1, Y --> X - 1; or (sbb), Y | |||
| 26288 | SDValue Sub; | |||
| 26289 | if (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE)) { | |||
| 26290 | SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType()); | |||
| 26291 | Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0); | |||
| 26292 | } else { | |||
| 26293 | SDValue One = DAG.getConstant(1, DL, CmpOp0.getValueType()); | |||
| 26294 | Sub = DAG.getNode(X86ISD::SUB, DL, CmpVTs, CmpOp0, One); | |||
| 26295 | } | |||
| 26296 | SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, | |||
| 26297 | DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), | |||
| 26298 | Sub.getValue(1)); | |||
| 26299 | return DAG.getNode(ISD::OR, DL, VT, SBB, Y); | |||
| 26300 | } else if (!Subtarget.canUseCMOV() && CondCode == X86::COND_E && | |||
| 26301 | CmpOp0.getOpcode() == ISD::AND && | |||
| 26302 | isOneConstant(CmpOp0.getOperand(1))) { | |||
| 26303 | SDValue Src1, Src2; | |||
| 26304 | // Returns true if Op2 is an XOR or OR operator and one of its operands | |||
| 26305 | // is equal to Op1, i.e. the pattern is | |||
| 26306 | // (a, a op b) || (b, a op b) | |||
| 26307 | auto isOrXorPattern = [&]() { | |||
| 26308 | if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) && | |||
| 26309 | (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) { | |||
| 26310 | Src1 = | |||
| 26311 | Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0); | |||
| 26312 | Src2 = Op1; | |||
| 26313 | return true; | |||
| 26314 | } | |||
| 26315 | return false; | |||
| 26316 | }; | |||
| 26317 | ||||
| 26318 | if (isOrXorPattern()) { | |||
| 26319 | SDValue Neg; | |||
| 26320 | unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits(); | |||
| 26321 | // We need a mask of all zeros or all ones with the same size as the other | |||
| 26322 | // operands. | |||
| 26323 | if (CmpSz > VT.getSizeInBits()) | |||
| 26324 | Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0); | |||
| 26325 | else if (CmpSz < VT.getSizeInBits()) | |||
| 26326 | Neg = DAG.getNode(ISD::AND, DL, VT, | |||
| 26327 | DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)), | |||
| 26328 | DAG.getConstant(1, DL, VT)); | |||
| 26329 | else | |||
| 26330 | Neg = CmpOp0; | |||
| 26331 | SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), | |||
| 26332 | Neg); // -(and (x, 0x1)) | |||
| 26333 | SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z | |||
| 26334 | return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y | |||
| 26335 | } | |||
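| | // Worked example: select ((x & 1) == 0), y, (z ^ y). When x & 1 is 1 the | |||
| | // mask -(x & 1) is all ones and (z & mask) ^ y yields z ^ y; when it is 0 | |||
| | // the mask is zero and the result collapses to y, matching the select. | |||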
| 26336 | } else if ((VT == MVT::i32 || VT == MVT::i64) && isNullConstant(Op2) && | |||
| 26337 | Cmp.getNode()->hasOneUse() && (CmpOp0 == Op1) && | |||
| 26338 | ((CondCode == X86::COND_S) || // smin(x, 0) | |||
| 26339 | (CondCode == X86::COND_G && hasAndNot(Op1)))) { // smax(x, 0) | |||
| 26340 | // (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x | |||
| 26341 | // | |||
| 26342 | // If the comparison is testing for a positive value, we have to invert | |||
| 26343 | // the sign bit mask, so only do that transform if the target has a | |||
| 26344 | // bitwise 'and not' instruction (the invert is free). | |||
| 26345 | // (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x | |||
| 26346 | unsigned ShCt = VT.getSizeInBits() - 1; | |||
| 26347 | SDValue ShiftAmt = DAG.getConstant(ShCt, DL, VT); | |||
| 26348 | SDValue Shift = DAG.getNode(ISD::SRA, DL, VT, Op1, ShiftAmt); | |||
| 26349 | if (CondCode == X86::COND_G) | |||
| 26350 | Shift = DAG.getNOT(DL, Shift, VT); | |||
| 26351 | return DAG.getNode(ISD::AND, DL, VT, Shift, Op1); | |||
| 26352 | } | |||
| 26353 | } | |||
| 26354 | ||||
| 26355 | // Look past (and (setcc_carry (cmp ...)), 1). | |||
| 26356 | if (Cond.getOpcode() == ISD::AND && | |||
| 26357 | Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY && | |||
| 26358 | isOneConstant(Cond.getOperand(1))) | |||
| 26359 | Cond = Cond.getOperand(0); | |||
| 26360 | ||||
| 26361 | // If the condition flag is set by an X86ISD::CMP, then use it as the | |||
| 26362 | // condition-setting operand in place of the X86ISD::SETCC. | |||
| 26363 | unsigned CondOpcode = Cond.getOpcode(); | |||
| 26364 | if (CondOpcode == X86ISD::SETCC || | |||
| 26365 | CondOpcode == X86ISD::SETCC_CARRY) { | |||
| 26366 | CC = Cond.getOperand(0); | |||
| 26367 | ||||
| 26368 | SDValue Cmp = Cond.getOperand(1); | |||
| 26369 | bool IllegalFPCMov = false; | |||
| 26370 | if (VT.isFloatingPoint() && !VT.isVector() && | |||
| 26371 | !isScalarFPTypeInSSEReg(VT) && Subtarget.canUseCMOV()) // FPStack? | |||
| 26372 | IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue()); | |||
| 26373 | ||||
| 26374 | if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) || | |||
| 26375 | Cmp.getOpcode() == X86ISD::BT) { // FIXME | |||
| 26376 | Cond = Cmp; | |||
| 26377 | AddTest = false; | |||
| 26378 | } | |||
| 26379 | } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO || | |||
| 26380 | CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO || | |||
| 26381 | CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) { | |||
| 26382 | SDValue Value; | |||
| 26383 | X86::CondCode X86Cond; | |||
| 26384 | std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); | |||
| 26385 | ||||
| 26386 | CC = DAG.getTargetConstant(X86Cond, DL, MVT::i8); | |||
| 26387 | AddTest = false; | |||
| 26388 | } | |||
| 26389 | ||||
| 26390 | if (AddTest) { | |||
| 26391 | // Look past the truncate if the high bits are known zero. | |||
| 26392 | if (isTruncWithZeroHighBitsInput(Cond, DAG)) | |||
| 26393 | Cond = Cond.getOperand(0); | |||
| 26394 | ||||
| 26395 | // We know the result of AND is compared against zero. Try to match | |||
| 26396 | // it to BT. | |||
| 26397 | if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { | |||
| 26398 | X86::CondCode X86CondCode; | |||
| 26399 | if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, DL, DAG, X86CondCode)) { | |||
| 26400 | CC = DAG.getTargetConstant(X86CondCode, DL, MVT::i8); | |||
| 26401 | Cond = BT; | |||
| 26402 | AddTest = false; | |||
| 26403 | } | |||
| 26404 | } | |||
| 26405 | } | |||
| 26406 | ||||
| 26407 | if (AddTest) { | |||
| 26408 | CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8); | |||
| 26409 | Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget); | |||
| 26410 | } | |||
| 26411 | ||||
| 26412 | // a < b ? -1 : 0 -> RES = ~setcc_carry | |||
| 26413 | // a < b ? 0 : -1 -> RES = setcc_carry | |||
| 26414 | // a >= b ? -1 : 0 -> RES = setcc_carry | |||
| 26415 | // a >= b ? 0 : -1 -> RES = ~setcc_carry | |||
| 26416 | if (Cond.getOpcode() == X86ISD::SUB) { | |||
| 26417 | unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue(); | |||
| 26418 | ||||
| 26419 | if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) && | |||
| 26420 | (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) && | |||
| 26421 | (isNullConstant(Op1) || isNullConstant(Op2))) { | |||
| 26422 | SDValue Res = | |||
| 26423 | DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(), | |||
| 26424 | DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cond); | |||
| 26425 | if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B)) | |||
| 26426 | return DAG.getNOT(DL, Res, Res.getValueType()); | |||
| 26427 | return Res; | |||
| 26428 | } | |||
| 26429 | } | |||
| 26430 | ||||
| 26431 | // X86 doesn't have an i8 cmov. If both operands are the result of a truncate, | |||
| 26432 | // widen the cmov and push the truncate through. This avoids introducing a new | |||
| 26433 | // branch during isel and doesn't add any extensions. | |||
| 26434 | if (Op.getValueType() == MVT::i8 && | |||
| 26435 | Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) { | |||
| 26436 | SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0); | |||
| 26437 | if (T1.getValueType() == T2.getValueType() && | |||
| 26438 | // Exclude CopyFromReg to avoid partial register stalls. | |||
| 26439 | T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode() != ISD::CopyFromReg) { | |||
| 26440 | SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1, | |||
| 26441 | CC, Cond); | |||
| 26442 | return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); | |||
| 26443 | } | |||
| 26444 | } | |||
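// Rough DAG shape of the rewrite above (sketch, not verbatim output):
//   select cc, (trunc i32 %t1), (trunc i32 %t2)
//     --> trunc (X86ISD::CMOV i32 %t2, %t1, cc)
// so isel sees one legal 32-bit CMOV and no i8 select.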
| 26445 | ||||
| 26446 | // Or finally, promote i8 cmovs if we have CMOV, | |||
| 26447 | // or i16 cmovs if it won't prevent folding a load. | |||
| 26448 | // FIXME: we should not limit promotion of the i8 case to only when the CMOV | |||
| 26449 | // is legal, but EmitLoweredSelect() cannot deal with these extensions being | |||
| 26450 | // inserted between two CMOVs (the i16 case has the same limitation). | |||
| 26451 | // https://bugs.llvm.org/show_bug.cgi?id=40974 | |||
| 26452 | if ((Op.getValueType() == MVT::i8 && Subtarget.canUseCMOV()) || | |||
| 26453 | (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) && | |||
| 26454 | !X86::mayFoldLoad(Op2, Subtarget))) { | |||
| 26455 | Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1); | |||
| 26456 | Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2); | |||
| 26457 | SDValue Ops[] = { Op2, Op1, CC, Cond }; | |||
| 26458 | SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, MVT::i32, Ops); | |||
| 26459 | return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); | |||
| 26460 | } | |||
| 26461 | ||||
| 26462 | // X86ISD::CMOV means set the result (which is operand 1) to the RHS if | |||
| 26463 | // condition is true. | |||
| 26464 | SDValue Ops[] = { Op2, Op1, CC, Cond }; | |||
| 26465 | return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops); | |||
| 26466 | } | |||
| 26467 | ||||
| 26468 | static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, | |||
| 26469 | const X86Subtarget &Subtarget, | |||
| 26470 | SelectionDAG &DAG) { | |||
| 26471 | MVT VT = Op->getSimpleValueType(0); | |||
| 26472 | SDValue In = Op->getOperand(0); | |||
| 26473 | MVT InVT = In.getSimpleValueType(); | |||
| 26474 | assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!"); | |||
| 26475 | MVT VTElt = VT.getVectorElementType(); | |||
| 26476 | SDLoc dl(Op); | |||
| 26477 | ||||
| 26478 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 26479 | ||||
| 26480 | // Extend VT if the scalar type is i8/i16 and BWI is not supported. | |||
| 26481 | MVT ExtVT = VT; | |||
| 26482 | if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) { | |||
| 26483 | // If v16i32 is to be avoided, we'll need to split and concatenate. | |||
| 26484 | if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) | |||
| 26485 | return SplitAndExtendv16i1(Op.getOpcode(), VT, In, dl, DAG); | |||
| 26486 | ||||
| 26487 | ExtVT = MVT::getVectorVT(MVT::i32, NumElts); | |||
| 26488 | } | |||
| 26489 | ||||
| 26490 | // Widen to 512-bits if VLX is not supported. | |||
| 26491 | MVT WideVT = ExtVT; | |||
| 26492 | if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) { | |||
| 26493 | NumElts *= 512 / ExtVT.getSizeInBits(); | |||
| 26494 | InVT = MVT::getVectorVT(MVT::i1, NumElts); | |||
| 26495 | In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), | |||
| 26496 | In, DAG.getIntPtrConstant(0, dl)); | |||
| 26497 | WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts); | |||
| 26498 | } | |||
| 26499 | ||||
| 26500 | SDValue V; | |||
| 26501 | MVT WideEltVT = WideVT.getVectorElementType(); | |||
| 26502 | if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) || | |||
| 26503 | (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) { | |||
| 26504 | V = DAG.getNode(Op.getOpcode(), dl, WideVT, In); | |||
| 26505 | } else { | |||
| 26506 | SDValue NegOne = DAG.getConstant(-1, dl, WideVT); | |||
| 26507 | SDValue Zero = DAG.getConstant(0, dl, WideVT); | |||
| 26508 | V = DAG.getSelect(dl, WideVT, In, NegOne, Zero); | |||
| 26509 | } | |||
| 26510 | ||||
| 26511 | // Truncate if we had to extend i16/i8 above. | |||
| 26512 | if (VT != ExtVT) { | |||
| 26513 | WideVT = MVT::getVectorVT(VTElt, NumElts); | |||
| 26514 | V = DAG.getNode(ISD::TRUNCATE, dl, WideVT, V); | |||
| 26515 | } | |||
| 26516 | ||||
| 26517 | // Extract back to 128/256-bit if we widened. | |||
| 26518 | if (WideVT != VT) | |||
| 26519 | V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V, | |||
| 26520 | DAG.getIntPtrConstant(0, dl)); | |||
| 26521 | ||||
| 26522 | return V; | |||
| 26523 | } | |||
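// Illustrative walk-through (assumed subtarget: AVX512F, no BWI/VLX):
// sign_extend v8i1 -> v8i16 takes ExtVT = v8i32, widens to v16i1 -> v16i32
// (512 bits), materializes the extension (or a select of -1/0 without DQI),
// truncates v16i32 -> v16i16, and finally extracts the low v8i16.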
| 26524 | ||||
| 26525 | static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, | |||
| 26526 | SelectionDAG &DAG) { | |||
| 26527 | SDValue In = Op->getOperand(0); | |||
| 26528 | MVT InVT = In.getSimpleValueType(); | |||
| 26529 | ||||
| 26530 | if (InVT.getVectorElementType() == MVT::i1) | |||
| 26531 | return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG); | |||
| 26532 | ||||
| 26533 | assert(Subtarget.hasAVX() && "Expected AVX support"); | |||
| 26534 | return LowerAVXExtend(Op, DAG, Subtarget); | |||
| 26535 | } | |||
| 26536 | ||||
| 26537 | // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG. | |||
| 26538 | // For sign extend this needs to handle all vector sizes and SSE4.1 and | |||
| 26539 | // non-SSE4.1 targets. For zero extend this should only handle inputs of | |||
| 26540 | // MVT::v64i8 when BWI is not supported, but AVX512 is. | |||
| 26541 | static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op, | |||
| 26542 | const X86Subtarget &Subtarget, | |||
| 26543 | SelectionDAG &DAG) { | |||
| 26544 | SDValue In = Op->getOperand(0); | |||
| 26545 | MVT VT = Op->getSimpleValueType(0); | |||
| 26546 | MVT InVT = In.getSimpleValueType(); | |||
| 26547 | ||||
| 26548 | MVT SVT = VT.getVectorElementType(); | |||
| 26549 | MVT InSVT = InVT.getVectorElementType(); | |||
| 26550 | assert(SVT.getFixedSizeInBits() > InSVT.getFixedSizeInBits()); | |||
| 26551 | ||||
| 26552 | if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16) | |||
| 26553 | return SDValue(); | |||
| 26554 | if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8) | |||
| 26555 | return SDValue(); | |||
| 26556 | if (!(VT.is128BitVector() && Subtarget.hasSSE2()) && | |||
| 26557 | !(VT.is256BitVector() && Subtarget.hasAVX()) && | |||
| 26558 | !(VT.is512BitVector() && Subtarget.hasAVX512())) | |||
| 26559 | return SDValue(); | |||
| 26560 | ||||
| 26561 | SDLoc dl(Op); | |||
| 26562 | unsigned Opc = Op.getOpcode(); | |||
| 26563 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 26564 | ||||
| 26565 | // For 256-bit vectors, we only need the lower (128-bit) half of the input. | |||
| 26566 | // For 512-bit vectors, we need 128-bits or 256-bits. | |||
| 26567 | if (InVT.getSizeInBits() > 128) { | |||
| 26568 | // Input needs to be at least the same number of elements as output, and | |||
| 26569 | // at least 128-bits. | |||
| 26570 | int InSize = InSVT.getSizeInBits() * NumElts; | |||
| 26571 | In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128)); | |||
| 26572 | InVT = In.getSimpleValueType(); | |||
| 26573 | } | |||
| 26574 | ||||
| 26575 | // SSE41 targets can use the pmov[sz]x* instructions directly for 128-bit | |||
| 26576 | // results, so those cases are legal and shouldn't occur here. AVX2/AVX512 | |||
| 26577 | // pmovsx* instructions still need to be handled here for 256/512-bit results. | |||
| 26578 | if (Subtarget.hasInt256()) { | |||
| 26579 | assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension"); | |||
| 26580 | ||||
| 26581 | if (InVT.getVectorNumElements() != NumElts) | |||
| 26582 | return DAG.getNode(Op.getOpcode(), dl, VT, In); | |||
| 26583 | ||||
| 26584 | // FIXME: Apparently we create inreg operations that could be regular | |||
| 26585 | // extends. | |||
| 26586 | unsigned ExtOpc = | |||
| 26587 | Opc == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SIGN_EXTEND | |||
| 26588 | : ISD::ZERO_EXTEND; | |||
| 26589 | return DAG.getNode(ExtOpc, dl, VT, In); | |||
| 26590 | } | |||
| 26591 | ||||
| 26592 | // pre-AVX2 256-bit extensions need to be split into 128-bit instructions. | |||
| 26593 | if (Subtarget.hasAVX()) { | |||
| 26594 | assert(VT.is256BitVector() && "256-bit vector expected"); | |||
| 26595 | MVT HalfVT = VT.getHalfNumVectorElementsVT(); | |||
| 26596 | int HalfNumElts = HalfVT.getVectorNumElements(); | |||
| 26597 | ||||
| 26598 | unsigned NumSrcElts = InVT.getVectorNumElements(); | |||
| 26599 | SmallVector<int, 16> HiMask(NumSrcElts, SM_SentinelUndef); | |||
| 26600 | for (int i = 0; i != HalfNumElts; ++i) | |||
| 26601 | HiMask[i] = HalfNumElts + i; | |||
| 26602 | ||||
| 26603 | SDValue Lo = DAG.getNode(Opc, dl, HalfVT, In); | |||
| 26604 | SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, DAG.getUNDEF(InVT), HiMask); | |||
| 26605 | Hi = DAG.getNode(Opc, dl, HalfVT, Hi); | |||
| 26606 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); | |||
| 26607 | } | |||
| 26608 | ||||
| 26609 | // We should only get here for sign extend. | |||
| 26610 | assert(Opc == ISD::SIGN_EXTEND_VECTOR_INREG && "Unexpected opcode!"); | |||
| 26611 | assert(VT.is128BitVector() && InVT.is128BitVector() && "Unexpected VTs"); | |||
| 26612 | ||||
| 26613 | // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI. | |||
| 26614 | SDValue Curr = In; | |||
| 26615 | SDValue SignExt = Curr; | |||
| 26616 | ||||
| 26617 | // As SRAI is only available on i16/i32 types, we expand only up to i32 | |||
| 26618 | // and handle i64 separately. | |||
| 26619 | if (InVT != MVT::v4i32) { | |||
| 26620 | MVT DestVT = VT == MVT::v2i64 ? MVT::v4i32 : VT; | |||
| 26621 | ||||
| 26622 | unsigned DestWidth = DestVT.getScalarSizeInBits(); | |||
| 26623 | unsigned Scale = DestWidth / InSVT.getSizeInBits(); | |||
| 26624 | ||||
| 26625 | unsigned InNumElts = InVT.getVectorNumElements(); | |||
| 26626 | unsigned DestElts = DestVT.getVectorNumElements(); | |||
| 26627 | ||||
| 26628 | // Build a shuffle mask that takes each input element and places it in the | |||
| 26629 | // MSBs of the new element size. | |||
| 26630 | SmallVector<int, 16> Mask(InNumElts, SM_SentinelUndef); | |||
| 26631 | for (unsigned i = 0; i != DestElts; ++i) | |||
| 26632 | Mask[i * Scale + (Scale - 1)] = i; | |||
| 26633 | ||||
| 26634 | Curr = DAG.getVectorShuffle(InVT, dl, In, In, Mask); | |||
| 26635 | Curr = DAG.getBitcast(DestVT, Curr); | |||
| 26636 | ||||
| 26637 | unsigned SignExtShift = DestWidth - InSVT.getSizeInBits(); | |||
| 26638 | SignExt = DAG.getNode(X86ISD::VSRAI, dl, DestVT, Curr, | |||
| 26639 | DAG.getTargetConstant(SignExtShift, dl, MVT::i8)); | |||
| 26640 | } | |||
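// Worked example of the shuffle+VSRAI step (illustrative): for
// sign_extend_vector_inreg v16i8 -> v4i32 without SSE4.1, Scale = 4 and
// Mask = {-1,-1,-1,0, -1,-1,-1,1, -1,-1,-1,2, -1,-1,-1,3}, which parks each
// source byte in the most-significant byte of an i32 lane (little endian);
// VSRAI by 24 then smears the sign bit across the rest of the lane.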
| 26641 | ||||
| 26642 | if (VT == MVT::v2i64) { | |||
| 26643 | assert(Curr.getValueType() == MVT::v4i32 && "Unexpected input VT"); | |||
| 26644 | SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32); | |||
| 26645 | SDValue Sign = DAG.getSetCC(dl, MVT::v4i32, Zero, Curr, ISD::SETGT); | |||
| 26646 | SignExt = DAG.getVectorShuffle(MVT::v4i32, dl, SignExt, Sign, {0, 4, 1, 5}); | |||
| 26647 | SignExt = DAG.getBitcast(VT, SignExt); | |||
| 26648 | } | |||
| 26649 | ||||
| 26650 | return SignExt; | |||
| 26651 | } | |||
| 26652 | ||||
| 26653 | static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, | |||
| 26654 | SelectionDAG &DAG) { | |||
| 26655 | MVT VT = Op->getSimpleValueType(0); | |||
| 26656 | SDValue In = Op->getOperand(0); | |||
| 26657 | MVT InVT = In.getSimpleValueType(); | |||
| 26658 | SDLoc dl(Op); | |||
| 26659 | ||||
| 26660 | if (InVT.getVectorElementType() == MVT::i1) | |||
| 26661 | return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG); | |||
| 26662 | ||||
| 26663 | assert(VT.isVector() && InVT.isVector() && "Expected vector type"); | |||
| 26664 | assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && | |||
| 26665 |        "Expected same number of elements"); | |||
| 26666 | assert((VT.getVectorElementType() == MVT::i16 || | |||
| 26667 |         VT.getVectorElementType() == MVT::i32 || | |||
| 26668 |         VT.getVectorElementType() == MVT::i64) && | |||
| 26669 |        "Unexpected element type"); | |||
| 26670 | assert((InVT.getVectorElementType() == MVT::i8 || | |||
| 26671 |         InVT.getVectorElementType() == MVT::i16 || | |||
| 26672 |         InVT.getVectorElementType() == MVT::i32) && | |||
| 26673 |        "Unexpected element type"); | |||
| 26674 | ||||
| 26675 | if (VT == MVT::v32i16 && !Subtarget.hasBWI()) { | |||
| 26676 | assert(InVT == MVT::v32i8 && "Unexpected VT!"); | |||
| 26677 | return splitVectorIntUnary(Op, DAG); | |||
| 26678 | } | |||
| 26679 | ||||
| 26680 | if (Subtarget.hasInt256()) | |||
| 26681 | return Op; | |||
| 26682 | ||||
| 26683 | // Optimize vectors in AVX mode | |||
| 26684 | // Sign extend v8i16 to v8i32 and | |||
| 26685 | // v4i32 to v4i64 | |||
| 26686 | // | |||
| 26687 | // Divide input vector into two parts | |||
| 26688 | // for v4i32 the high shuffle mask will be {2, 3, -1, -1} | |||
| 26689 | // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32 | |||
| 26690 | // concat the vectors to original VT | |||
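// E.g. (illustrative) sign_extend v8i16 -> v8i32 on AVX1:
//   OpLo = sext_invec(In)                           ; low 4 x i16 -> v4i32
//   OpHi = sext_invec(shuffle(In, {4,5,6,7,-1,...})); high 4 x i16 -> v4i32
//   result = concat_vectors(OpLo, OpHi)             ; one v8i32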
| 26691 | MVT HalfVT = VT.getHalfNumVectorElementsVT(); | |||
| 26692 | SDValue OpLo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, In); | |||
| 26693 | ||||
| 26694 | unsigned NumElems = InVT.getVectorNumElements(); | |||
| 26695 | SmallVector<int,8> ShufMask(NumElems, -1); | |||
| 26696 | for (unsigned i = 0; i != NumElems/2; ++i) | |||
| 26697 | ShufMask[i] = i + NumElems/2; | |||
| 26698 | ||||
| 26699 | SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask); | |||
| 26700 | OpHi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, HalfVT, OpHi); | |||
| 26701 | ||||
| 26702 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); | |||
| 26703 | } | |||
| 26704 | ||||
| 26705 | /// Change a vector store into a pair of half-size vector stores. | |||
| 26706 | static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) { | |||
| 26707 | SDValue StoredVal = Store->getValue(); | |||
| 26708 | assert((StoredVal.getValueType().is256BitVector() || | |||
| 26709 |         StoredVal.getValueType().is512BitVector()) && | |||
| 26710 |        "Expecting 256/512-bit op"); | |||
| 26711 | ||||
| 26712 | // Splitting volatile memory ops is not allowed unless the operation was not | |||
| 26713 | // legal to begin with. Assume the input store is legal (this transform is | |||
| 26714 | // only used for targets with AVX). Note: It is possible that we have an | |||
| 26715 | // illegal type like v2i128, and so we could allow splitting a volatile store | |||
| 26716 | // in that case if that is important. | |||
| 26717 | if (!Store->isSimple()) | |||
| 26718 | return SDValue(); | |||
| 26719 | ||||
| 26720 | SDLoc DL(Store); | |||
| 26721 | SDValue Value0, Value1; | |||
| 26722 | std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL); | |||
| 26723 | unsigned HalfOffset = Value0.getValueType().getStoreSize(); | |||
| 26724 | SDValue Ptr0 = Store->getBasePtr(); | |||
| 26725 | SDValue Ptr1 = | |||
| 26726 | DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(HalfOffset), DL); | |||
| 26727 | SDValue Ch0 = | |||
| 26728 | DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(), | |||
| 26729 | Store->getOriginalAlign(), | |||
| 26730 | Store->getMemOperand()->getFlags()); | |||
| 26731 | SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1, | |||
| 26732 | Store->getPointerInfo().getWithOffset(HalfOffset), | |||
| 26733 | Store->getOriginalAlign(), | |||
| 26734 | Store->getMemOperand()->getFlags()); | |||
| 26735 | return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1); | |||
| 26736 | } | |||
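// Rough shape of the result (sketch): a 256-bit store of
//   concat_vectors(v2f64 %a, v2f64 %b) to %p
// becomes two independent 128-bit stores
//   store %a -> %p        and        store %b -> %p + 16
// whose chains are rejoined with a TokenFactor.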
| 26737 | ||||
| 26738 | /// Scalarize a vector store, bitcasting to TargetVT to determine the scalar | |||
| 26739 | /// type. | |||
| 26740 | static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT, | |||
| 26741 | SelectionDAG &DAG) { | |||
| 26742 | SDValue StoredVal = Store->getValue(); | |||
| 26743 | assert(StoreVT.is128BitVector() && | |||
| 26744 |        StoredVal.getValueType().is128BitVector() && "Expecting 128-bit op"); | |||
| 26745 | StoredVal = DAG.getBitcast(StoreVT, StoredVal); | |||
| 26746 | ||||
| 26747 | // Splitting volatile memory ops is not allowed unless the operation was not | |||
| 26748 | // legal to begin with. We are assuming the input op is legal (this transform | |||
| 26749 | // is only used for targets with AVX). | |||
| 26750 | if (!Store->isSimple()) | |||
| 26751 | return SDValue(); | |||
| 26752 | ||||
| 26753 | MVT StoreSVT = StoreVT.getScalarType(); | |||
| 26754 | unsigned NumElems = StoreVT.getVectorNumElements(); | |||
| 26755 | unsigned ScalarSize = StoreSVT.getStoreSize(); | |||
| 26756 | ||||
| 26757 | SDLoc DL(Store); | |||
| 26758 | SmallVector<SDValue, 4> Stores; | |||
| 26759 | for (unsigned i = 0; i != NumElems; ++i) { | |||
| 26760 | unsigned Offset = i * ScalarSize; | |||
| 26761 | SDValue Ptr = DAG.getMemBasePlusOffset(Store->getBasePtr(), | |||
| 26762 | TypeSize::Fixed(Offset), DL); | |||
| 26763 | SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreSVT, StoredVal, | |||
| 26764 | DAG.getIntPtrConstant(i, DL)); | |||
| 26765 | SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr, | |||
| 26766 | Store->getPointerInfo().getWithOffset(Offset), | |||
| 26767 | Store->getOriginalAlign(), | |||
| 26768 | Store->getMemOperand()->getFlags()); | |||
| 26769 | Stores.push_back(Ch); | |||
| 26770 | } | |||
| 26771 | return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores); | |||
| 26772 | } | |||
| 26773 | ||||
| 26774 | static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget, | |||
| 26775 | SelectionDAG &DAG) { | |||
| 26776 | StoreSDNode *St = cast<StoreSDNode>(Op.getNode()); | |||
| 26777 | SDLoc dl(St); | |||
| 26778 | SDValue StoredVal = St->getValue(); | |||
| 26779 | ||||
| 26780 | // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 stores. | |||
| 26781 | if (StoredVal.getValueType().isVector() && | |||
| 26782 | StoredVal.getValueType().getVectorElementType() == MVT::i1) { | |||
| 26783 | unsigned NumElts = StoredVal.getValueType().getVectorNumElements(); | |||
| 26784 | assert(NumElts <= 8 && "Unexpected VT"); | |||
| 26785 | assert(!St->isTruncatingStore() && "Expected non-truncating store"); | |||
| 26786 | assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && | |||
| 26787 |        "Expected AVX512F without AVX512DQI"); | |||
| 26788 | ||||
| 26789 | // We must pad with zeros to ensure we store zeroes to any unused bits. | |||
| 26790 | StoredVal = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, | |||
| 26791 | DAG.getUNDEF(MVT::v16i1), StoredVal, | |||
| 26792 | DAG.getIntPtrConstant(0, dl)); | |||
| 26793 | StoredVal = DAG.getBitcast(MVT::i16, StoredVal); | |||
| 26794 | StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal); | |||
| 26795 | // Make sure we store zeros in the extra bits. | |||
| 26796 | if (NumElts < 8) | |||
| 26797 | StoredVal = DAG.getZeroExtendInReg( | |||
| 26798 | StoredVal, dl, EVT::getIntegerVT(*DAG.getContext(), NumElts)); | |||
| 26799 | ||||
| 26800 | return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), | |||
| 26801 | St->getPointerInfo(), St->getOriginalAlign(), | |||
| 26802 | St->getMemOperand()->getFlags()); | |||
| 26803 | } | |||
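// Worked example (illustrative): a store of v4i1 %m is widened into v16i1,
// bitcast to i16, truncated to i8, and then zero-extended in-reg from i4 so
// that bits 4-7 are known zeroes; the final operation is a plain i8 store.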
| 26804 | ||||
| 26805 | if (St->isTruncatingStore()) | |||
| 26806 | return SDValue(); | |||
| 26807 | ||||
| 26808 | // If this is a 256-bit store of concatenated ops, we are better off splitting | |||
| 26809 | // that store into two 128-bit stores. This avoids spurious use of 256-bit ops | |||
| 26810 | // and each half can execute independently. Some cores would split the op into | |||
| 26811 | // halves anyway, so the concat (vinsertf128) is purely an extra op. | |||
| 26812 | MVT StoreVT = StoredVal.getSimpleValueType(); | |||
| 26813 | if (StoreVT.is256BitVector() || | |||
| 26814 | ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) && | |||
| 26815 | !Subtarget.hasBWI())) { | |||
| 26816 | SmallVector<SDValue, 4> CatOps; | |||
| 26817 | if (StoredVal.hasOneUse() && | |||
| 26818 | collectConcatOps(StoredVal.getNode(), CatOps, DAG)) | |||
| 26819 | return splitVectorStore(St, DAG); | |||
| 26820 | return SDValue(); | |||
| 26821 | } | |||
| 26822 | ||||
| 26823 | if (StoreVT.is32BitVector()) | |||
| 26824 | return SDValue(); | |||
| 26825 | ||||
| 26826 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 26827 | assert(StoreVT.is64BitVector() && "Unexpected VT"); | |||
| 26828 | assert(TLI.getTypeAction(*DAG.getContext(), StoreVT) == | |||
| 26829 |            TargetLowering::TypeWidenVector && | |||
| 26830 |        "Unexpected type action!"); | |||
| 26831 | ||||
| 26832 | EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), StoreVT); | |||
| 26833 | StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, StoredVal, | |||
| 26834 | DAG.getUNDEF(StoreVT)); | |||
| 26835 | ||||
| 26836 | if (Subtarget.hasSSE2()) { | |||
| 26837 | // Widen the vector, cast to a v2x64 type, extract the single 64-bit element | |||
| 26838 | // and store it. | |||
| 26839 | MVT StVT = Subtarget.is64Bit() && StoreVT.isInteger() ? MVT::i64 : MVT::f64; | |||
| 26840 | MVT CastVT = MVT::getVectorVT(StVT, 2); | |||
| 26841 | StoredVal = DAG.getBitcast(CastVT, StoredVal); | |||
| 26842 | StoredVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, StVT, StoredVal, | |||
| 26843 | DAG.getIntPtrConstant(0, dl)); | |||
| 26844 | ||||
| 26845 | return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), | |||
| 26846 | St->getPointerInfo(), St->getOriginalAlign(), | |||
| 26847 | St->getMemOperand()->getFlags()); | |||
| 26848 | } | |||
| 26849 | assert(Subtarget.hasSSE1() && "Expected SSE"); | |||
| 26850 | SDVTList Tys = DAG.getVTList(MVT::Other); | |||
| 26851 | SDValue Ops[] = {St->getChain(), StoredVal, St->getBasePtr()}; | |||
| 26852 | return DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, MVT::i64, | |||
| 26853 | St->getMemOperand()); | |||
| 26854 | } | |||
| 26855 | ||||
| 26856 | // Lower vector extended loads using a shuffle. If SSSE3 is not available we | |||
| 26857 | // may emit an illegal shuffle but the expansion is still better than scalar | |||
| 26858 | // code. We generate sext/sext_invec for SEXTLOADs if it's available, otherwise | |||
| 26859 | // we'll emit a shuffle and an arithmetic shift. | |||
| 26860 | // FIXME: Is the expansion actually better than scalar code? It doesn't seem so. | |||
| 26861 | // TODO: It is possible to support ZExt by zeroing the undef values during | |||
| 26862 | // the shuffle phase or after the shuffle. | |||
| 26863 | static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget, | |||
| 26864 | SelectionDAG &DAG) { | |||
| 26865 | MVT RegVT = Op.getSimpleValueType(); | |||
| 26866 | assert(RegVT.isVector() && "We only custom lower vector loads."); | |||
| 26867 | assert(RegVT.isInteger() && | |||
| 26868 |        "We only custom lower integer vector loads."); | |||
| 26869 | ||||
| 26870 | LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode()); | |||
| 26871 | SDLoc dl(Ld); | |||
| 26872 | ||||
| 26873 | // Without AVX512DQ, we need to use a scalar type for v2i1/v4i1/v8i1 loads. | |||
| 26874 | if (RegVT.getVectorElementType() == MVT::i1) { | |||
| 26875 | assert(EVT(RegVT) == Ld->getMemoryVT() && "Expected non-extending load"); | |||
| 26876 | assert(RegVT.getVectorNumElements() <= 8 && "Unexpected VT"); | |||
| 26877 | assert(Subtarget.hasAVX512() && !Subtarget.hasDQI() && | |||
| 26878 |        "Expected AVX512F without AVX512DQI"); | |||
| 26879 | ||||
| 26880 | SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(), | |||
| 26881 | Ld->getPointerInfo(), Ld->getOriginalAlign(), | |||
| 26882 | Ld->getMemOperand()->getFlags()); | |||
| 26883 | ||||
| 26884 | // Replace chain users with the new chain. | |||
| 26885 | assert(NewLd->getNumValues() == 2 && "Loads must carry a chain!"); | |||
| 26886 | ||||
| 26887 | SDValue Val = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, NewLd); | |||
| 26888 | Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, RegVT, | |||
| 26889 | DAG.getBitcast(MVT::v16i1, Val), | |||
| 26890 | DAG.getIntPtrConstant(0, dl)); | |||
| 26891 | return DAG.getMergeValues({Val, NewLd.getValue(1)}, dl); | |||
| 26892 | } | |||
| 26893 | ||||
| 26894 | return SDValue(); | |||
| 26895 | } | |||
| 26896 | ||||
| 26897 | /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes | |||
| 26898 | /// each of which has no other use apart from the AND / OR. | |||
| 26899 | static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) { | |||
| 26900 | Opc = Op.getOpcode(); | |||
| 26901 | if (Opc != ISD::OR && Opc != ISD::AND) | |||
| 26902 | return false; | |||
| 26903 | return (Op.getOperand(0).getOpcode() == X86ISD::SETCC && | |||
| 26904 | Op.getOperand(0).hasOneUse() && | |||
| 26905 | Op.getOperand(1).getOpcode() == X86ISD::SETCC && | |||
| 26906 | Op.getOperand(1).hasOneUse()); | |||
| 26907 | } | |||
| 26908 | ||||
| 26909 | SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { | |||
| 26910 | SDValue Chain = Op.getOperand(0); | |||
| 26911 | SDValue Cond = Op.getOperand(1); | |||
| 26912 | SDValue Dest = Op.getOperand(2); | |||
| 26913 | SDLoc dl(Op); | |||
| 26914 | ||||
| 26915 | // Bail out when we don't have native compare instructions. | |||
| 26916 | if (Cond.getOpcode() == ISD::SETCC && | |||
| 26917 | Cond.getOperand(0).getValueType() != MVT::f128 && | |||
| 26918 | !isSoftFP16(Cond.getOperand(0).getValueType())) { | |||
| 26919 | SDValue LHS = Cond.getOperand(0); | |||
| 26920 | SDValue RHS = Cond.getOperand(1); | |||
| 26921 | ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); | |||
| 26922 | ||||
| 26923 | // Special case for | |||
| 26924 | // setcc([su]{add,sub,mul}o == 0) | |||
| 26925 | // setcc([su]{add,sub,mul}o != 1) | |||
| 26926 | if (ISD::isOverflowIntrOpRes(LHS) && | |||
| 26927 | (CC == ISD::SETEQ || CC == ISD::SETNE) && | |||
| 26928 | (isNullConstant(RHS) || isOneConstant(RHS))) { | |||
| 26929 | SDValue Value, Overflow; | |||
| 26930 | X86::CondCode X86Cond; | |||
| 26931 | std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG); | |||
| 26932 | ||||
| 26933 | if ((CC == ISD::SETEQ) == isNullConstant(RHS)) | |||
| 26934 | X86Cond = X86::GetOppositeBranchCondition(X86Cond); | |||
| 26935 | ||||
| 26936 | SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8); | |||
| 26937 | return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, | |||
| 26938 | Overflow); | |||
| 26939 | } | |||
| 26940 | ||||
| 26941 | if (LHS.getSimpleValueType().isInteger()) { | |||
| 26942 | SDValue CCVal; | |||
| 26943 | SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal); | |||
| 26944 | return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, | |||
| 26945 | EFLAGS); | |||
| 26946 | } | |||
| 26947 | ||||
| 26948 | if (CC == ISD::SETOEQ) { | |||
| 26949 | // For FCMP_OEQ, we can emit | |||
| 26950 | // two branches instead of an explicit AND instruction with a | |||
| 26951 | // separate test. However, we only do this if this block doesn't | |||
| 26952 | // have a fall-through edge, because this requires an explicit | |||
| 26953 | // jmp when the condition is false. | |||
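// Illustrative sequence (sketch) for "br (setoeq f32 %a, %b), %T; br %F"
// after the successors are reversed below:
//   ucomiss %b, %a
//   jne %F          ; ZF == 0 -> values differ
//   jp  %F          ; PF == 1 -> unordered (NaN operand)
//   jmp %T          ; the rewritten unconditional branch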
| 26954 | if (Op.getNode()->hasOneUse()) { | |||
| 26955 | SDNode *User = *Op.getNode()->use_begin(); | |||
| 26956 | // Look for an unconditional branch following this conditional branch. | |||
| 26957 | // We need this because we need to reverse the successors in order | |||
| 26958 | // to implement FCMP_OEQ. | |||
| 26959 | if (User->getOpcode() == ISD::BR) { | |||
| 26960 | SDValue FalseBB = User->getOperand(1); | |||
| 26961 | SDNode *NewBR = | |||
| 26962 | DAG.UpdateNodeOperands(User, User->getOperand(0), Dest); | |||
| 26963 | assert(NewBR == User); | |||
| 26964 | (void)NewBR; | |||
| 26965 | Dest = FalseBB; | |||
| 26966 | ||||
| 26967 | SDValue Cmp = | |||
| 26968 | DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS); | |||
| 26969 | SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); | |||
| 26970 | Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, | |||
| 26971 | CCVal, Cmp); | |||
| 26972 | CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); | |||
| 26973 | return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, | |||
| 26974 | Cmp); | |||
| 26975 | } | |||
| 26976 | } | |||
| 26977 | } else if (CC == ISD::SETUNE) { | |||
| 26978 | // For FCMP_UNE, we can emit | |||
| 26979 | // two branches instead of an explicit OR instruction with a | |||
| 26980 | // separate test. | |||
| 26981 | SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS); | |||
| 26982 | SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8); | |||
| 26983 | Chain = | |||
| 26984 | DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp); | |||
| 26985 | CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8); | |||
| 26986 | return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, | |||
| 26987 | Cmp); | |||
| 26988 | } else { | |||
| 26989 | X86::CondCode X86Cond = | |||
| 26990 | TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG); | |||
| 26991 | SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS); | |||
| 26992 | SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8); | |||
| 26993 | return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, | |||
| 26994 | Cmp); | |||
| 26995 | } | |||
| 26996 | } | |||
| 26997 | ||||
| 26998 | if (ISD::isOverflowIntrOpRes(Cond)) { | |||
| 26999 | SDValue Value, Overflow; | |||
| 27000 | X86::CondCode X86Cond; | |||
| 27001 | std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG); | |||
| 27002 | ||||
| 27003 | SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8); | |||
| 27004 | return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, | |||
| 27005 | Overflow); | |||
| 27006 | } | |||
| 27007 | ||||
| 27008 | // Look past the truncate if the high bits are known zero. | |||
| 27009 | if (isTruncWithZeroHighBitsInput(Cond, DAG)) | |||
| 27010 | Cond = Cond.getOperand(0); | |||
| 27011 | ||||
| 27012 | EVT CondVT = Cond.getValueType(); | |||
| 27013 | ||||
| 27014 | // Add an AND with 1 if we don't already have one. | |||
| 27015 | if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1)))) | |||
| 27016 | Cond = | |||
| 27017 | DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT)); | |||
| 27018 | ||||
| 27019 | SDValue LHS = Cond; | |||
| 27020 | SDValue RHS = DAG.getConstant(0, dl, CondVT); | |||
| 27021 | ||||
| 27022 | SDValue CCVal; | |||
| 27023 | SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal); | |||
| 27024 | return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, | |||
| 27025 | EFLAGS); | |||
| 27026 | } | |||
| 27027 | ||||
| 27028 | // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets. | |||
| 27029 | // Calls to _alloca are needed to probe the stack when allocating more than 4k | |||
| 27030 | // bytes in one go. Touching the stack at 4K increments is necessary to ensure | |||
| 27031 | // that the guard pages used by the OS virtual memory manager are allocated in | |||
| 27032 | // correct sequence. | |||
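// Illustrative failure mode (sketch): "sub rsp, 0x3000" for a 12KiB
// allocation could jump straight over the single 4KiB guard page, so the
// lowered form instead calls the probe routine, roughly:
//   mov  eax, 0x3000
//   call _alloca        ; (or __chkstk) touches each 4KiB page in turn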
| 27033 | SDValue | |||
| 27034 | X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, | |||
| 27035 | SelectionDAG &DAG) const { | |||
| 27036 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 27037 | bool SplitStack = MF.shouldSplitStack(); | |||
| 27038 | bool EmitStackProbeCall = hasStackProbeSymbol(MF); | |||
| 27039 | bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) || | |||
| 27040 | SplitStack || EmitStackProbeCall; | |||
| 27041 | SDLoc dl(Op); | |||
| 27042 | ||||
| 27043 | // Get the inputs. | |||
| 27044 | SDNode *Node = Op.getNode(); | |||
| 27045 | SDValue Chain = Op.getOperand(0); | |||
| 27046 | SDValue Size = Op.getOperand(1); | |||
| 27047 | MaybeAlign Alignment(Op.getConstantOperandVal(2)); | |||
| 27048 | EVT VT = Node->getValueType(0); | |||
| 27049 | ||||
| 27050 | // Chain the dynamic stack allocation so that it doesn't modify the stack | |||
| 27051 | // pointer when other instructions are using the stack. | |||
| 27052 | Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); | |||
| 27053 | ||||
| 27054 | bool Is64Bit = Subtarget.is64Bit(); | |||
| 27055 | MVT SPTy = getPointerTy(DAG.getDataLayout()); | |||
| 27056 | ||||
| 27057 | SDValue Result; | |||
| 27058 | if (!Lower) { | |||
| 27059 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 27060 | Register SPReg = TLI.getStackPointerRegisterToSaveRestore(); | |||
| 27061 | assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and" | |||
| 27062 |        " not tell us which reg is the stack pointer!"); | |||
| 27063 | ||||
| 27064 | const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); | |||
| 27065 | const Align StackAlign = TFI.getStackAlign(); | |||
| 27066 | if (hasInlineStackProbe(MF)) { | |||
| 27067 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
| 27068 | ||||
| 27069 | const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); | |||
| 27070 | Register Vreg = MRI.createVirtualRegister(AddrRegClass); | |||
| 27071 | Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); | |||
| 27072 | Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain, | |||
| 27073 | DAG.getRegister(Vreg, SPTy)); | |||
| 27074 | } else { | |||
| 27075 | SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); | |||
| 27076 | Chain = SP.getValue(1); | |||
| 27077 | Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value | |||
| 27078 | } | |||
| 27079 | if (Alignment && *Alignment > StackAlign) | |||
| 27080 | Result = | |||
| 27081 | DAG.getNode(ISD::AND, dl, VT, Result, | |||
| 27082 | DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT)); | |||
| 27083 | Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain | |||
| 27084 | } else if (SplitStack) { | |||
| 27085 | MachineRegisterInfo &MRI = MF.getRegInfo(); | |||
| 27086 | ||||
| 27087 | if (Is64Bit) { | |||
| 27088 | // The 64-bit implementation of segmented stacks needs to clobber both r10 | |||
| 27089 | // and r11. This makes it impossible to use it along with nested parameters. | |||
| 27090 | const Function &F = MF.getFunction(); | |||
| 27091 | for (const auto &A : F.args()) { | |||
| 27092 | if (A.hasNestAttr()) | |||
| 27093 | report_fatal_error("Cannot use segmented stacks with functions that " | |||
| 27094 | "have nested arguments."); | |||
| 27095 | } | |||
| 27096 | } | |||
| 27097 | ||||
| 27098 | const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); | |||
| 27099 | Register Vreg = MRI.createVirtualRegister(AddrRegClass); | |||
| 27100 | Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); | |||
| 27101 | Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, | |||
| 27102 | DAG.getRegister(Vreg, SPTy)); | |||
| 27103 | } else { | |||
| 27104 | SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); | |||
| 27105 | Chain = DAG.getNode(X86ISD::DYN_ALLOCA, dl, NodeTys, Chain, Size); | |||
| 27106 | MF.getInfo<X86MachineFunctionInfo>()->setHasDynAlloca(true); | |||
| 27107 | ||||
| 27108 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); | |||
| 27109 | Register SPReg = RegInfo->getStackRegister(); | |||
| 27110 | SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy); | |||
| 27111 | Chain = SP.getValue(1); | |||
| 27112 | ||||
| 27113 | if (Alignment) { | |||
| 27114 | SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0), | |||
| 27115 | DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT)); | |||
| 27116 | Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP); | |||
| 27117 | } | |||
| 27118 | ||||
| 27119 | Result = SP; | |||
| 27120 | } | |||
| 27121 | ||||
| 27122 | Chain = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl); | |||
| 27123 | ||||
| 27124 | SDValue Ops[2] = {Result, Chain}; | |||
| 27125 | return DAG.getMergeValues(Ops, dl); | |||
| 27126 | } | |||
| 27127 | ||||
| 27128 | SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { | |||
| 27129 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 27130 | auto PtrVT = getPointerTy(MF.getDataLayout()); | |||
| 27131 | X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); | |||
| 27132 | ||||
| 27133 | const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); | |||
| 27134 | SDLoc DL(Op); | |||
| 27135 | ||||
| 27136 | if (!Subtarget.is64Bit() || | |||
| 27137 | Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) { | |||
| 27138 | // vastart just stores the address of the VarArgsFrameIndex slot into the | |||
| 27139 | // memory location argument. | |||
| 27140 | SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); | |||
| 27141 | return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), | |||
| 27142 | MachinePointerInfo(SV)); | |||
| 27143 | } | |||
| 27144 | ||||
| 27145 | // __va_list_tag: | |||
| 27146 | // gp_offset (0 - 6 * 8) | |||
| 27147 | // fp_offset (48 - 48 + 8 * 16) | |||
| 27148 | // overflow_arg_area (point to parameters coming in memory). | |||
| 27149 | // reg_save_area | |||
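// In C terms (SysV AMD64 ABI) the structure being initialized is roughly:
//   typedef struct {
//     unsigned int gp_offset;    // 0..48, advanced 8 bytes per GP register
//     unsigned int fp_offset;    // 48..176, advanced 16 bytes per XMM register
//     void *overflow_arg_area;   // arguments passed on the stack
//     void *reg_save_area;       // spilled GP/XMM argument registers
//   } __va_list_tag;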
| 27150 | SmallVector<SDValue, 8> MemOps; | |||
| 27151 | SDValue FIN = Op.getOperand(1); | |||
| 27152 | // Store gp_offset | |||
| 27153 | SDValue Store = DAG.getStore( | |||
| 27154 | Op.getOperand(0), DL, | |||
| 27155 | DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN, | |||
| 27156 | MachinePointerInfo(SV)); | |||
| 27157 | MemOps.push_back(Store); | |||
| 27158 | ||||
| 27159 | // Store fp_offset | |||
| 27160 | FIN = DAG.getMemBasePlusOffset(FIN, TypeSize::Fixed(4), DL); | |||
| 27161 | Store = DAG.getStore( | |||
| 27162 | Op.getOperand(0), DL, | |||
| 27163 | DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN, | |||
| 27164 | MachinePointerInfo(SV, 4)); | |||
| 27165 | MemOps.push_back(Store); | |||
| 27166 | ||||
| 27167 | // Store ptr to overflow_arg_area | |||
| 27168 | FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL)); | |||
| 27169 | SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); | |||
| 27170 | Store = | |||
| 27171 | DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8)); | |||
| 27172 | MemOps.push_back(Store); | |||
| 27173 | ||||
| 27174 | // Store ptr to reg_save_area. | |||
| 27175 | FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant( | |||
| 27176 | Subtarget.isTarget64BitLP64() ? 8 : 4, DL)); | |||
| 27177 | SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT); | |||
| 27178 | Store = DAG.getStore( | |||
| 27179 | Op.getOperand(0), DL, RSFIN, FIN, | |||
| 27180 | MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12)); | |||
| 27181 | MemOps.push_back(Store); | |||
| 27182 | return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps); | |||
| 27183 | } | |||
| 27184 | ||||
| 27185 | SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { | |||
| 27186 | assert(Subtarget.is64Bit() && | |||
| 27187 |        "LowerVAARG only handles 64-bit va_arg!"); | |||
| 27188 | assert(Op.getNumOperands() == 4); | |||
| 27189 | ||||
| 27190 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 27191 | if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) | |||
| 27192 | // The Win64 ABI uses char* instead of a structure. | |||
| 27193 | return DAG.expandVAArg(Op.getNode()); | |||
| 27194 | ||||
| 27195 | SDValue Chain = Op.getOperand(0); | |||
| 27196 | SDValue SrcPtr = Op.getOperand(1); | |||
| 27197 | const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); | |||
| 27198 | unsigned Align = Op.getConstantOperandVal(3); | |||
| 27199 | SDLoc dl(Op); | |||
| 27200 | ||||
| 27201 | EVT ArgVT = Op.getNode()->getValueType(0); | |||
| 27202 | Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); | |||
| 27203 | uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy); | |||
| 27204 | uint8_t ArgMode; | |||
| 27205 | ||||
| 27206 | // Decide which area this value should be read from. | |||
| 27207 | // TODO: Implement the AMD64 ABI in its entirety. This simple | |||
| 27208 | // selection mechanism works only for the basic types. | |||
| 27209 | assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented"); | |||
| 27210 | if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) { | |||
| 27211 | ArgMode = 2; // Argument passed in XMM register. Use fp_offset. | |||
| 27212 | } else { | |||
| 27213 | assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ && | |||
| 27214 |        "Unhandled argument type in LowerVAARG"); | |||
| 27215 | ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset. | |||
| 27216 | } | |||
| 27217 | ||||
| 27218 | if (ArgMode == 2) { | |||
| 27219 | // Make sure using fp_offset makes sense. | |||
| 27220 | assert(!Subtarget.useSoftFloat() && | |||
| 27221 | !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) && | |||
| 27222 | Subtarget.hasSSE1()); | |||
| 27223 | } | |||
| 27224 | ||||
| 27225 | // Insert a VAARG node into the DAG. | |||
| 27226 | // VAARG returns two values: the variable-argument address and the chain. | |||
| 27227 | SDValue InstOps[] = {Chain, SrcPtr, | |||
| 27228 | DAG.getTargetConstant(ArgSize, dl, MVT::i32), | |||
| 27229 | DAG.getTargetConstant(ArgMode, dl, MVT::i8), | |||
| 27230 | DAG.getTargetConstant(Align, dl, MVT::i32)}; | |||
| 27231 | SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); | |||
| 27232 | SDValue VAARG = DAG.getMemIntrinsicNode( | |||
| 27233 | Subtarget.isTarget64BitLP64() ? X86ISD::VAARG_64 : X86ISD::VAARG_X32, dl, | |||
| 27234 | VTs, InstOps, MVT::i64, MachinePointerInfo(SV), | |||
| 27235 | /*Alignment=*/std::nullopt, | |||
| 27236 | MachineMemOperand::MOLoad | MachineMemOperand::MOStore); | |||
| 27237 | Chain = VAARG.getValue(1); | |||
| 27238 | ||||
| 27239 | // Load the next argument and return it | |||
| 27240 | return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo()); | |||
| 27241 | } | |||
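| | ||||
| | // Illustrative sketch only (not used by the lowering above): a C-level model | |||
| | // of the SysV AMD64 va_arg walk that the VAARG_64 node expands to. The struct | |||
| | // mirrors the psABI's __va_list_tag; all *Sketch names are hypothetical and | |||
| | // exist purely for exposition. | |||
| | namespace { | |||
| | struct VaListSketch { | |||
| | unsigned GPOffset; // next GPR slot in the reg save area (0..48) | |||
| | unsigned FPOffset; // next XMM slot in the reg save area (48..176) | |||
| | void *OverflowArgArea; // next stack-passed argument | |||
| | void *RegSaveArea; // registers spilled by the prologue | |||
| | }; | |||
| | void *vaArgGPSketch(VaListSketch &AP) { // the ArgMode == 1 (gp_offset) path | |||
| | if (AP.GPOffset + 8 <= 48) { // 6 GPRs * 8 bytes | |||
| | void *P = static_cast<char *>(AP.RegSaveArea) + AP.GPOffset; | |||
| | AP.GPOffset += 8; | |||
| | return P; | |||
| | } | |||
| | void *P = AP.OverflowArgArea; // fall back to the stack | |||
| | AP.OverflowArgArea = static_cast<char *>(P) + 8; | |||
| | return P; | |||
| | } | |||
| | } // namespace | |||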
| 27242 | ||||
| 27243 | static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, | |||
| 27244 | SelectionDAG &DAG) { | |||
| 27245 | // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows, | |||
| 27246 | // where a va_list is still an i8*. | |||
| 27247 | assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!"); | |||
| 27248 | if (Subtarget.isCallingConvWin64( | |||
| 27249 | DAG.getMachineFunction().getFunction().getCallingConv())) | |||
| 27250 | // Probably a Win64 va_copy. | |||
| 27251 | return DAG.expandVACopy(Op.getNode()); | |||
| 27252 | ||||
| 27253 | SDValue Chain = Op.getOperand(0); | |||
| 27254 | SDValue DstPtr = Op.getOperand(1); | |||
| 27255 | SDValue SrcPtr = Op.getOperand(2); | |||
| 27256 | const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); | |||
| 27257 | const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); | |||
| 27258 | SDLoc DL(Op); | |||
| 27259 | ||||
| 27260 | return DAG.getMemcpy( | |||
| 27261 | Chain, DL, DstPtr, SrcPtr, | |||
| 27262 | DAG.getIntPtrConstant(Subtarget.isTarget64BitLP64() ? 24 : 16, DL), | |||
| 27263 | Align(Subtarget.isTarget64BitLP64() ? 8 : 4), /*isVolatile*/ false, false, | |||
| 27264 | false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); | |||
| 27265 | } | |||
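| | ||||
| | // Illustrative sketch only: on LP64 the va_list is the 24-byte | |||
| | // { i32, i32, i8*, i8* } struct, so va_copy degenerates into the plain | |||
| | // 24-byte copy emitted above (16 bytes on x32). Reuses the hypothetical | |||
| | // VaListSketch from the previous sketch. | |||
| | namespace { | |||
| | void vaCopySketch(VaListSketch &Dst, const VaListSketch &Src) { | |||
| | Dst = Src; // equivalent to memcpy(&Dst, &Src, 24) on LP64 | |||
| | } | |||
| | } // namespace | |||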
| 27266 | ||||
| 27267 | // Helper to get immediate/variable SSE shift opcode from other shift opcodes. | |||
| 27268 | static unsigned getTargetVShiftUniformOpcode(unsigned Opc, bool IsVariable) { | |||
| 27269 | switch (Opc) { | |||
| 27270 | case ISD::SHL: | |||
| 27271 | case X86ISD::VSHL: | |||
| 27272 | case X86ISD::VSHLI: | |||
| 27273 | return IsVariable ? X86ISD::VSHL : X86ISD::VSHLI; | |||
| 27274 | case ISD::SRL: | |||
| 27275 | case X86ISD::VSRL: | |||
| 27276 | case X86ISD::VSRLI: | |||
| 27277 | return IsVariable ? X86ISD::VSRL : X86ISD::VSRLI; | |||
| 27278 | case ISD::SRA: | |||
| 27279 | case X86ISD::VSRA: | |||
| 27280 | case X86ISD::VSRAI: | |||
| 27281 | return IsVariable ? X86ISD::VSRA : X86ISD::VSRAI; | |||
| 27282 | } | |||
| 27283 | llvm_unreachable("Unknown target vector shift node")::llvm::llvm_unreachable_internal("Unknown target vector shift node" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 27283); | |||
| 27284 | } | |||
| 27285 | ||||
| 27286 | /// Handle vector element shifts where the shift amount is a constant. | |||
| 27287 | /// Takes immediate version of shift as input. | |||
| 27288 | static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT, | |||
| 27289 | SDValue SrcOp, uint64_t ShiftAmt, | |||
| 27290 | SelectionDAG &DAG) { | |||
| 27291 | MVT ElementType = VT.getVectorElementType(); | |||
| 27292 | ||||
| 27293 | // Bitcast the source vector to the output type, this is mainly necessary for | |||
| 27294 | // vXi8/vXi64 shifts. | |||
| 27295 | if (VT != SrcOp.getSimpleValueType()) | |||
| 27296 | SrcOp = DAG.getBitcast(VT, SrcOp); | |||
| 27297 | ||||
| 27298 | // Fold this packed shift into its first operand if ShiftAmt is 0. | |||
| 27299 | if (ShiftAmt == 0) | |||
| 27300 | return SrcOp; | |||
| 27301 | ||||
| 27302 | // Check for ShiftAmt >= element width | |||
| 27303 | if (ShiftAmt >= ElementType.getSizeInBits()) { | |||
| 27304 | if (Opc == X86ISD::VSRAI) | |||
| 27305 | ShiftAmt = ElementType.getSizeInBits() - 1; | |||
| 27306 | else | |||
| 27307 | return DAG.getConstant(0, dl, VT); | |||
| 27308 | } | |||
| 27309 | ||||
| 27310 | assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI) | |||
| 27311 | && "Unknown target vector shift-by-constant node"); | |||
| 27312 | ||||
| 27313 | // Fold this packed vector shift into a build vector if SrcOp is a | |||
| 27314 | // vector of Constants or UNDEFs. | |||
| 27315 | if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) { | |||
| 27316 | unsigned ShiftOpc; | |||
| 27317 | switch (Opc) { | |||
| 27318 | default: llvm_unreachable("Unknown opcode!"); | |||
| 27319 | case X86ISD::VSHLI: | |||
| 27320 | ShiftOpc = ISD::SHL; | |||
| 27321 | break; | |||
| 27322 | case X86ISD::VSRLI: | |||
| 27323 | ShiftOpc = ISD::SRL; | |||
| 27324 | break; | |||
| 27325 | case X86ISD::VSRAI: | |||
| 27326 | ShiftOpc = ISD::SRA; | |||
| 27327 | break; | |||
| 27328 | } | |||
| 27329 | ||||
| 27330 | SDValue Amt = DAG.getConstant(ShiftAmt, dl, VT); | |||
| 27331 | if (SDValue C = DAG.FoldConstantArithmetic(ShiftOpc, dl, VT, {SrcOp, Amt})) | |||
| 27332 | return C; | |||
| 27333 | } | |||
| 27334 | ||||
| 27335 | return DAG.getNode(Opc, dl, VT, SrcOp, | |||
| 27336 | DAG.getTargetConstant(ShiftAmt, dl, MVT::i8)); | |||
| 27337 | } | |||
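| | ||||
| | // Illustrative sketch only: the per-lane scalar semantics that the clamping | |||
| | // above models. x86 logical shifts (VSHLI/VSRLI) flush to zero once the | |||
| | // amount reaches the element width, while arithmetic shifts (VSRAI) saturate | |||
| | // to a splat of the sign bit. Hypothetical names, assuming <cstdint> types. | |||
| | namespace { | |||
| | int32_t vsraiLaneSketch(int32_t Lane, uint64_t Amt) { | |||
| | if (Amt >= 32) | |||
| | Amt = 31; // clamp: the result becomes a splat of the sign bit | |||
| | return Lane >> Amt; // arithmetic shift of a signed lane | |||
| | } | |||
| | uint32_t vsrliLaneSketch(uint32_t Lane, uint64_t Amt) { | |||
| | return Amt >= 32 ? 0 : Lane >> Amt; // logical shift flushes to zero | |||
| | } | |||
| | } // namespace | |||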
| 27338 | ||||
| 27339 | /// Handle vector element shifts by a splat shift amount | |||
| 27340 | static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, | |||
| 27341 | SDValue SrcOp, SDValue ShAmt, int ShAmtIdx, | |||
| 27342 | const X86Subtarget &Subtarget, | |||
| 27343 | SelectionDAG &DAG) { | |||
| 27344 | MVT AmtVT = ShAmt.getSimpleValueType(); | |||
| 27345 | assert(AmtVT.isVector() && "Vector shift type mismatch"); | |||
| 27346 | assert(0 <= ShAmtIdx && ShAmtIdx < (int)AmtVT.getVectorNumElements() && | |||
| 27347 | "Illegal vector splat index"); | |||
| 27348 | ||||
| 27349 | // Move the splat element to the bottom element. | |||
| 27350 | if (ShAmtIdx != 0) { | |||
| 27351 | SmallVector<int> Mask(AmtVT.getVectorNumElements(), -1); | |||
| 27352 | Mask[0] = ShAmtIdx; | |||
| 27353 | ShAmt = DAG.getVectorShuffle(AmtVT, dl, ShAmt, DAG.getUNDEF(AmtVT), Mask); | |||
| 27354 | } | |||
| 27355 | ||||
| 27356 | // Peek through any zext node if we can get back to a 128-bit source. | |||
| 27357 | if (AmtVT.getScalarSizeInBits() == 64 && | |||
| 27358 | (ShAmt.getOpcode() == ISD::ZERO_EXTEND || | |||
| 27359 | ShAmt.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) && | |||
| 27360 | ShAmt.getOperand(0).getValueType().isSimple() && | |||
| 27361 | ShAmt.getOperand(0).getValueType().is128BitVector()) { | |||
| 27362 | ShAmt = ShAmt.getOperand(0); | |||
| 27363 | AmtVT = ShAmt.getSimpleValueType(); | |||
| 27364 | } | |||
| 27365 | ||||
| 27366 | // See if we can mask off the upper elements using the existing source node. | |||
| 27367 | // The shift uses the entire lower 64 bits of the amount vector, so there | |||
| 27368 | // is no need to do this for vXi64 types. | |||
| 27369 | bool IsMasked = false; | |||
| 27370 | if (AmtVT.getScalarSizeInBits() < 64) { | |||
| 27371 | if (ShAmt.getOpcode() == ISD::BUILD_VECTOR || | |||
| 27372 | ShAmt.getOpcode() == ISD::SCALAR_TO_VECTOR) { | |||
| 27373 | // If the shift amount has come from a scalar, then zero-extend the scalar | |||
| 27374 | // before moving to the vector. | |||
| 27375 | ShAmt = DAG.getZExtOrTrunc(ShAmt.getOperand(0), dl, MVT::i32); | |||
| 27376 | ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt); | |||
| 27377 | ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, ShAmt); | |||
| 27378 | AmtVT = MVT::v4i32; | |||
| 27379 | IsMasked = true; | |||
| 27380 | } else if (ShAmt.getOpcode() == ISD::AND) { | |||
| 27381 | // See if the shift amount is already masked (e.g. for rotation modulo), | |||
| 27382 | // then we can zero-extend it by setting all the other mask elements to | |||
| 27383 | // zero. | |||
| 27384 | SmallVector<SDValue> MaskElts( | |||
| 27385 | AmtVT.getVectorNumElements(), | |||
| 27386 | DAG.getConstant(0, dl, AmtVT.getScalarType())); | |||
| 27387 | MaskElts[0] = DAG.getAllOnesConstant(dl, AmtVT.getScalarType()); | |||
| 27388 | SDValue Mask = DAG.getBuildVector(AmtVT, dl, MaskElts); | |||
| 27389 | if ((Mask = DAG.FoldConstantArithmetic(ISD::AND, dl, AmtVT, | |||
| 27390 | {ShAmt.getOperand(1), Mask}))) { | |||
| 27391 | ShAmt = DAG.getNode(ISD::AND, dl, AmtVT, ShAmt.getOperand(0), Mask); | |||
| 27392 | IsMasked = true; | |||
| 27393 | } | |||
| 27394 | } | |||
| 27395 | } | |||
| 27396 | ||||
| 27397 | // Extract if the shift amount vector is larger than 128-bits. | |||
| 27398 | if (AmtVT.getSizeInBits() > 128) { | |||
| 27399 | ShAmt = extract128BitVector(ShAmt, 0, DAG, dl); | |||
| 27400 | AmtVT = ShAmt.getSimpleValueType(); | |||
| 27401 | } | |||
| 27402 | ||||
| 27403 | // Zero-extend bottom element to v2i64 vector type, either by extension or | |||
| 27404 | // shuffle masking. | |||
| 27405 | if (!IsMasked && AmtVT.getScalarSizeInBits() < 64) { | |||
| 27406 | if (AmtVT == MVT::v4i32 && (ShAmt.getOpcode() == X86ISD::VBROADCAST || | |||
| 27407 | ShAmt.getOpcode() == X86ISD::VBROADCAST_LOAD)) { | |||
| 27408 | ShAmt = DAG.getNode(X86ISD::VZEXT_MOVL, SDLoc(ShAmt), MVT::v4i32, ShAmt); | |||
| 27409 | } else if (Subtarget.hasSSE41()) { | |||
| 27410 | ShAmt = DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(ShAmt), | |||
| 27411 | MVT::v2i64, ShAmt); | |||
| 27412 | } else { | |||
| 27413 | SDValue ByteShift = DAG.getTargetConstant( | |||
| 27414 | (128 - AmtVT.getScalarSizeInBits()) / 8, SDLoc(ShAmt), MVT::i8); | |||
| 27415 | ShAmt = DAG.getBitcast(MVT::v16i8, ShAmt); | |||
| 27416 | ShAmt = DAG.getNode(X86ISD::VSHLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt, | |||
| 27417 | ByteShift); | |||
| 27418 | ShAmt = DAG.getNode(X86ISD::VSRLDQ, SDLoc(ShAmt), MVT::v16i8, ShAmt, | |||
| 27419 | ByteShift); | |||
| 27420 | } | |||
| 27421 | } | |||
| 27422 | ||||
| 27423 | // Change opcode to non-immediate version. | |||
| 27424 | Opc = getTargetVShiftUniformOpcode(Opc, true); | |||
| 27425 | ||||
| 27426 | // The return type has to be a 128-bit type with the same element | |||
| 27427 | // type as the input type. | |||
| 27428 | MVT EltVT = VT.getVectorElementType(); | |||
| 27429 | MVT ShVT = MVT::getVectorVT(EltVT, 128 / EltVT.getSizeInBits()); | |||
| 27430 | ||||
| 27431 | ShAmt = DAG.getBitcast(ShVT, ShAmt); | |||
| 27432 | return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); | |||
| 27433 | } | |||
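| | ||||
| | // Illustrative sketch only: the pre-SSE4.1 fallback above zero-extends the | |||
| | // bottom element with a PSLLDQ/PSRLDQ byte-shift pair. Its net effect on a | |||
| | // 16-byte lane, modeled on a plain array (hypothetical name): | |||
| | namespace { | |||
| | void zextBottomEltSketch(unsigned char Vec[16], unsigned EltBytes) { | |||
| | // Shift left by (16 - EltBytes) bytes, then right by the same amount: | |||
| | // everything above the bottom element is replaced with zeros. | |||
| | for (unsigned I = EltBytes; I != 16; ++I) | |||
| | Vec[I] = 0; | |||
| | } | |||
| | } // namespace | |||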
| 27434 | ||||
| 27435 | /// Return Mask with the necessary casting or extending | |||
| 27436 | /// for \p Mask according to \p MaskVT when lowering masking intrinsics | |||
| 27437 | static SDValue getMaskNode(SDValue Mask, MVT MaskVT, | |||
| 27438 | const X86Subtarget &Subtarget, SelectionDAG &DAG, | |||
| 27439 | const SDLoc &dl) { | |||
| 27440 | ||||
| 27441 | if (isAllOnesConstant(Mask)) | |||
| 27442 | return DAG.getConstant(1, dl, MaskVT); | |||
| 27443 | if (X86::isZeroNode(Mask)) | |||
| 27444 | return DAG.getConstant(0, dl, MaskVT); | |||
| 27445 | ||||
| 27446 | assert(MaskVT.bitsLE(Mask.getSimpleValueType()) && "Unexpected mask size!"); | |||
| 27447 | ||||
| 27448 | if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) { | |||
| 27449 | assert(MaskVT == MVT::v64i1 && "Expected v64i1 mask!"); | |||
| 27450 | assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); | |||
| 27451 | // In 32-bit mode a bitcast of i64 is illegal, so extend/split it instead. | |||
| 27452 | SDValue Lo, Hi; | |||
| 27453 | std::tie(Lo, Hi) = DAG.SplitScalar(Mask, dl, MVT::i32, MVT::i32); | |||
| 27454 | Lo = DAG.getBitcast(MVT::v32i1, Lo); | |||
| 27455 | Hi = DAG.getBitcast(MVT::v32i1, Hi); | |||
| 27456 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); | |||
| 27457 | } else { | |||
| 27458 | MVT BitcastVT = MVT::getVectorVT(MVT::i1, | |||
| 27459 | Mask.getSimpleValueType().getSizeInBits()); | |||
| 27460 | // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements are | |||
| 27461 | // extracted by EXTRACT_SUBVECTOR. | |||
| 27462 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT, | |||
| 27463 | DAG.getBitcast(BitcastVT, Mask), | |||
| 27464 | DAG.getIntPtrConstant(0, dl)); | |||
| 27465 | } | |||
| 27466 | } | |||
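| | ||||
| | // Illustrative sketch only: the 32-bit v64i1 path above is equivalent to | |||
| | // splitting the scalar mask into two 32-lane halves and concatenating them | |||
| | // (hypothetical name, assuming <cstdint> types): | |||
| | namespace { | |||
| | void splitMask64Sketch(uint64_t Mask, uint32_t &Lo, uint32_t &Hi) { | |||
| | Lo = static_cast<uint32_t>(Mask); // lanes 0..31 -> v32i1 | |||
| | Hi = static_cast<uint32_t>(Mask >> 32); // lanes 32..63 -> v32i1 | |||
| | } | |||
| | } // namespace | |||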
| 27467 | ||||
| 27468 | /// Return (and \p Op, \p Mask) for compare instructions or | |||
| 27469 | /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the | |||
| 27470 | /// necessary casting or extending for \p Mask when lowering masking intrinsics | |||
| 27471 | static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, | |||
| 27472 | SDValue PreservedSrc, | |||
| 27473 | const X86Subtarget &Subtarget, | |||
| 27474 | SelectionDAG &DAG) { | |||
| 27475 | MVT VT = Op.getSimpleValueType(); | |||
| 27476 | MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); | |||
| 27477 | unsigned OpcodeSelect = ISD::VSELECT; | |||
| 27478 | SDLoc dl(Op); | |||
| 27479 | ||||
| 27480 | if (isAllOnesConstant(Mask)) | |||
| 27481 | return Op; | |||
| 27482 | ||||
| 27483 | SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); | |||
| 27484 | ||||
| 27485 | if (PreservedSrc.isUndef()) | |||
| 27486 | PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); | |||
| 27487 | return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc); | |||
| 27488 | } | |||
| 27489 | ||||
| 27490 | /// Creates an SDNode for a predicated scalar operation. | |||
| 27491 | /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc). | |||
| 27492 | /// The mask comes in as MVT::i8 and should be transformed | |||
| 27493 | /// to MVT::v1i1 while lowering masking intrinsics. | |||
| 27494 | /// The main difference between ScalarMaskingNode and VectorMaskingNode is using | |||
| 27495 | /// "X86select" instead of "vselect". We just can't create the "vselect" node | |||
| 27496 | /// for a scalar instruction. | |||
| 27497 | static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask, | |||
| 27498 | SDValue PreservedSrc, | |||
| 27499 | const X86Subtarget &Subtarget, | |||
| 27500 | SelectionDAG &DAG) { | |||
| 27501 | ||||
| 27502 | if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask)) | |||
| 27503 | if (MaskConst->getZExtValue() & 0x1) | |||
| 27504 | return Op; | |||
| 27505 | ||||
| 27506 | MVT VT = Op.getSimpleValueType(); | |||
| 27507 | SDLoc dl(Op); | |||
| 27508 | ||||
| 27509 | assert(Mask.getValueType() == MVT::i8 && "Unexpect type")(static_cast <bool> (Mask.getValueType() == MVT::i8 && "Unexpect type") ? void (0) : __assert_fail ("Mask.getValueType() == MVT::i8 && \"Unexpect type\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 27509, __extension__ __PRETTY_FUNCTION__)); | |||
| 27510 | SDValue IMask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v1i1, | |||
| 27511 | DAG.getBitcast(MVT::v8i1, Mask), | |||
| 27512 | DAG.getIntPtrConstant(0, dl)); | |||
| 27513 | if (Op.getOpcode() == X86ISD::FSETCCM || | |||
| 27514 | Op.getOpcode() == X86ISD::FSETCCM_SAE || | |||
| 27515 | Op.getOpcode() == X86ISD::VFPCLASSS) | |||
| 27516 | return DAG.getNode(ISD::AND, dl, VT, Op, IMask); | |||
| 27517 | ||||
| 27518 | if (PreservedSrc.isUndef()) | |||
| 27519 | PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl); | |||
| 27520 | return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc); | |||
| 27521 | } | |||
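| | ||||
| | // Illustrative sketch only: in the scalar case just bit 0 of the i8 mask | |||
| | // participates, selecting between the new result and the preserved source | |||
| | // (hypothetical name): | |||
| | namespace { | |||
| | double scalarMaskSketch(unsigned char Mask, double Op, double PreservedSrc) { | |||
| | return (Mask & 0x1) ? Op : PreservedSrc; | |||
| | } | |||
| | } // namespace | |||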
| 27522 | ||||
| 27523 | static int getSEHRegistrationNodeSize(const Function *Fn) { | |||
| 27524 | if (!Fn->hasPersonalityFn()) | |||
| 27525 | report_fatal_error( | |||
| 27526 | "querying registration node size for function without personality"); | |||
| 27527 | // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See | |||
| 27528 | // WinEHStatePass for the full struct definition. | |||
| 27529 | switch (classifyEHPersonality(Fn->getPersonalityFn())) { | |||
| 27530 | case EHPersonality::MSVC_X86SEH: return 24; | |||
| 27531 | case EHPersonality::MSVC_CXX: return 16; | |||
| 27532 | default: break; | |||
| 27533 | } | |||
| 27534 | report_fatal_error( | |||
| 27535 | "can only recover FP for 32-bit MSVC EH personality functions"); | |||
| 27536 | } | |||
| 27537 | ||||
| 27538 | /// When the MSVC runtime transfers control to us, either to an outlined | |||
| 27539 | /// function or when returning to a parent frame after catching an exception, we | |||
| 27540 | /// recover the parent frame pointer by doing arithmetic on the incoming EBP. | |||
| 27541 | /// Here's the math: | |||
| 27542 | /// RegNodeBase = EntryEBP - RegNodeSize | |||
| 27543 | /// ParentFP = RegNodeBase - ParentFrameOffset | |||
| 27544 | /// Subtracting RegNodeSize takes us to the offset of the registration node, and | |||
| 27545 | /// subtracting the offset (negative on x86) takes us back to the parent FP. | |||
| 27546 | static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, | |||
| 27547 | SDValue EntryEBP) { | |||
| 27548 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 27549 | SDLoc dl; | |||
| 27550 | ||||
| 27551 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 27552 | MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); | |||
| 27553 | ||||
| 27554 | // It's possible that the parent function no longer has a personality function | |||
| 27555 | // if the exceptional code was optimized away, in which case we just return | |||
| 27556 | // the incoming EBP. | |||
| 27557 | if (!Fn->hasPersonalityFn()) | |||
| 27558 | return EntryEBP; | |||
| 27559 | ||||
| 27560 | // Get an MCSymbol that will ultimately resolve to the frame offset of the EH | |||
| 27561 | // registration, or the .set_setframe offset. | |||
| 27562 | MCSymbol *OffsetSym = | |||
| 27563 | MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol( | |||
| 27564 | GlobalValue::dropLLVMManglingEscape(Fn->getName())); | |||
| 27565 | SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT); | |||
| 27566 | SDValue ParentFrameOffset = | |||
| 27567 | DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal); | |||
| 27568 | ||||
| 27569 | // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after | |||
| 27570 | // prologue to RBP in the parent function. | |||
| 27571 | const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>(); | |||
| 27572 | if (Subtarget.is64Bit()) | |||
| 27573 | return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset); | |||
| 27574 | ||||
| 27575 | int RegNodeSize = getSEHRegistrationNodeSize(Fn); | |||
| 27576 | // RegNodeBase = EntryEBP - RegNodeSize | |||
| 27577 | // ParentFP = RegNodeBase - ParentFrameOffset | |||
| 27578 | SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP, | |||
| 27579 | DAG.getConstant(RegNodeSize, dl, PtrVT)); | |||
| 27580 | return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset); | |||
| 27581 | } | |||
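| | ||||
| | // Illustrative sketch only: the 32-bit recovery math above in plain pointer | |||
| | // arithmetic (hypothetical name): | |||
| | namespace { | |||
| | char *recoverFPSketch(char *EntryEBP, int RegNodeSize, int ParentFrameOffset) { | |||
| | char *RegNodeBase = EntryEBP - RegNodeSize; // skip the EH registration node | |||
| | return RegNodeBase - ParentFrameOffset; // the offset is negative on x86 | |||
| | } | |||
| | } // namespace | |||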
| 27582 | ||||
| 27583 | SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, | |||
| 27584 | SelectionDAG &DAG) const { | |||
| 27585 | // Helper to detect if the operand is CUR_DIRECTION rounding mode. | |||
| 27586 | auto isRoundModeCurDirection = [](SDValue Rnd) { | |||
| 27587 | if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) | |||
| 27588 | return C->getAPIntValue() == X86::STATIC_ROUNDING::CUR_DIRECTION; | |||
| 27589 | ||||
| 27590 | return false; | |||
| 27591 | }; | |||
| 27592 | auto isRoundModeSAE = [](SDValue Rnd) { | |||
| 27593 | if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) { | |||
| 27594 | unsigned RC = C->getZExtValue(); | |||
| 27595 | if (RC & X86::STATIC_ROUNDING::NO_EXC) { | |||
| 27596 | // Clear the NO_EXC bit and check remaining bits. | |||
| 27597 | RC ^= X86::STATIC_ROUNDING::NO_EXC; | |||
| 27598 | // As a convenience, accept either no other bits set or an | |||
| 27599 | // explicit current-direction mode. | |||
| 27600 | return RC == 0 || RC == X86::STATIC_ROUNDING::CUR_DIRECTION; | |||
| 27601 | } | |||
| 27602 | } | |||
| 27603 | ||||
| 27604 | return false; | |||
| 27605 | }; | |||
| 27606 | auto isRoundModeSAEToX = [](SDValue Rnd, unsigned &RC) { | |||
| 27607 | if (auto *C = dyn_cast<ConstantSDNode>(Rnd)) { | |||
| 27608 | RC = C->getZExtValue(); | |||
| 27609 | if (RC & X86::STATIC_ROUNDING::NO_EXC) { | |||
| 27610 | // Clear the NO_EXC bit and check remaining bits. | |||
| 27611 | RC ^= X86::STATIC_ROUNDING::NO_EXC; | |||
| 27612 | return RC == X86::STATIC_ROUNDING::TO_NEAREST_INT || | |||
| 27613 | RC == X86::STATIC_ROUNDING::TO_NEG_INF || | |||
| 27614 | RC == X86::STATIC_ROUNDING::TO_POS_INF || | |||
| 27615 | RC == X86::STATIC_ROUNDING::TO_ZERO; | |||
| 27616 | } | |||
| 27617 | } | |||
| 27618 | ||||
| 27619 | return false; | |||
| 27620 | }; | |||
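| | ||||
| | // For reference (assuming the usual X86::STATIC_ROUNDING encoding from | |||
| | // X86BaseInfo.h): the operand packs a 2-bit rounding control with NO_EXC: | |||
| | // TO_NEAREST_INT = 0, TO_NEG_INF = 1, TO_POS_INF = 2, TO_ZERO = 3, | |||
| | // CUR_DIRECTION = 4, NO_EXC = 8. So e.g. (TO_ZERO | NO_EXC) == 11 satisfies | |||
| | // isRoundModeSAEToX with RC == 3, while plain NO_EXC == 8 satisfies | |||
| | // isRoundModeSAE. | |||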
| 27621 | ||||
| 27622 | SDLoc dl(Op); | |||
| 27623 | unsigned IntNo = Op.getConstantOperandVal(0); | |||
| 27624 | MVT VT = Op.getSimpleValueType(); | |||
| 27625 | const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo); | |||
| 27626 | ||||
| 27627 | // Propagate flags from original node to transformed node(s). | |||
| 27628 | SelectionDAG::FlagInserter FlagsInserter(DAG, Op->getFlags()); | |||
| 27629 | ||||
| 27630 | if (IntrData) { | |||
| 27631 | switch(IntrData->Type) { | |||
| 27632 | case INTR_TYPE_1OP: { | |||
| 27633 | // We specify 2 possible opcodes for intrinsics with rounding modes. | |||
| 27634 | // First, we check whether the intrinsic may have a non-default rounding | |||
| 27635 | // mode (IntrData->Opc1 != 0), then we check the rounding mode operand. | |||
| 27636 | unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; | |||
| 27637 | if (IntrWithRoundingModeOpcode != 0) { | |||
| 27638 | SDValue Rnd = Op.getOperand(2); | |||
| 27639 | unsigned RC = 0; | |||
| 27640 | if (isRoundModeSAEToX(Rnd, RC)) | |||
| 27641 | return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), | |||
| 27642 | Op.getOperand(1), | |||
| 27643 | DAG.getTargetConstant(RC, dl, MVT::i32)); | |||
| 27644 | if (!isRoundModeCurDirection(Rnd)) | |||
| 27645 | return SDValue(); | |||
| 27646 | } | |||
| 27647 | return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), | |||
| 27648 | Op.getOperand(1)); | |||
| 27649 | } | |||
| 27650 | case INTR_TYPE_1OP_SAE: { | |||
| 27651 | SDValue Sae = Op.getOperand(2); | |||
| 27652 | ||||
| 27653 | unsigned Opc; | |||
| 27654 | if (isRoundModeCurDirection(Sae)) | |||
| 27655 | Opc = IntrData->Opc0; | |||
| 27656 | else if (isRoundModeSAE(Sae)) | |||
| 27657 | Opc = IntrData->Opc1; | |||
| 27658 | else | |||
| 27659 | return SDValue(); | |||
| 27660 | ||||
| 27661 | return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1)); | |||
| 27662 | } | |||
| 27663 | case INTR_TYPE_2OP: { | |||
| 27664 | SDValue Src2 = Op.getOperand(2); | |||
| 27665 | ||||
| 27666 | // We specify 2 possible opcodes for intrinsics with rounding modes. | |||
| 27667 | // First, we check whether the intrinsic may have a non-default rounding | |||
| 27668 | // mode (IntrData->Opc1 != 0), then we check the rounding mode operand. | |||
| 27669 | unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; | |||
| 27670 | if (IntrWithRoundingModeOpcode != 0) { | |||
| 27671 | SDValue Rnd = Op.getOperand(3); | |||
| 27672 | unsigned RC = 0; | |||
| 27673 | if (isRoundModeSAEToX(Rnd, RC)) | |||
| 27674 | return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), | |||
| 27675 | Op.getOperand(1), Src2, | |||
| 27676 | DAG.getTargetConstant(RC, dl, MVT::i32)); | |||
| 27677 | if (!isRoundModeCurDirection(Rnd)) | |||
| 27678 | return SDValue(); | |||
| 27679 | } | |||
| 27680 | ||||
| 27681 | return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), | |||
| 27682 | Op.getOperand(1), Src2); | |||
| 27683 | } | |||
| 27684 | case INTR_TYPE_2OP_SAE: { | |||
| 27685 | SDValue Sae = Op.getOperand(3); | |||
| 27686 | ||||
| 27687 | unsigned Opc; | |||
| 27688 | if (isRoundModeCurDirection(Sae)) | |||
| 27689 | Opc = IntrData->Opc0; | |||
| 27690 | else if (isRoundModeSAE(Sae)) | |||
| 27691 | Opc = IntrData->Opc1; | |||
| 27692 | else | |||
| 27693 | return SDValue(); | |||
| 27694 | ||||
| 27695 | return DAG.getNode(Opc, dl, Op.getValueType(), Op.getOperand(1), | |||
| 27696 | Op.getOperand(2)); | |||
| 27697 | } | |||
| 27698 | case INTR_TYPE_3OP: | |||
| 27699 | case INTR_TYPE_3OP_IMM8: { | |||
| 27700 | SDValue Src1 = Op.getOperand(1); | |||
| 27701 | SDValue Src2 = Op.getOperand(2); | |||
| 27702 | SDValue Src3 = Op.getOperand(3); | |||
| 27703 | ||||
| 27704 | if (IntrData->Type == INTR_TYPE_3OP_IMM8 && | |||
| 27705 | Src3.getValueType() != MVT::i8) { | |||
| 27706 | Src3 = DAG.getTargetConstant( | |||
| 27707 | cast<ConstantSDNode>(Src3)->getZExtValue() & 0xff, dl, MVT::i8); | |||
| 27708 | } | |||
| 27709 | ||||
| 27710 | // We specify 2 possible opcodes for intrinsics with rounding modes. | |||
| 27711 | // First, we check whether the intrinsic may have a non-default rounding | |||
| 27712 | // mode (IntrData->Opc1 != 0), then we check the rounding mode operand. | |||
| 27713 | unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; | |||
| 27714 | if (IntrWithRoundingModeOpcode != 0) { | |||
| 27715 | SDValue Rnd = Op.getOperand(4); | |||
| 27716 | unsigned RC = 0; | |||
| 27717 | if (isRoundModeSAEToX(Rnd, RC)) | |||
| 27718 | return DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), | |||
| 27719 | Src1, Src2, Src3, | |||
| 27720 | DAG.getTargetConstant(RC, dl, MVT::i32)); | |||
| 27721 | if (!isRoundModeCurDirection(Rnd)) | |||
| 27722 | return SDValue(); | |||
| 27723 | } | |||
| 27724 | ||||
| 27725 | return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), | |||
| 27726 | {Src1, Src2, Src3}); | |||
| 27727 | } | |||
| 27728 | case INTR_TYPE_4OP_IMM8: { | |||
| 27729 | assert(Op.getOperand(4)->getOpcode() == ISD::TargetConstant); | |||
| 27730 | SDValue Src4 = Op.getOperand(4); | |||
| 27731 | if (Src4.getValueType() != MVT::i8) { | |||
| 27732 | Src4 = DAG.getTargetConstant( | |||
| 27733 | cast<ConstantSDNode>(Src4)->getZExtValue() & 0xff, dl, MVT::i8); | |||
| 27734 | } | |||
| 27735 | ||||
| 27736 | return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), | |||
| 27737 | Op.getOperand(1), Op.getOperand(2), Op.getOperand(3), | |||
| 27738 | Src4); | |||
| 27739 | } | |||
| 27740 | case INTR_TYPE_1OP_MASK: { | |||
| 27741 | SDValue Src = Op.getOperand(1); | |||
| 27742 | SDValue PassThru = Op.getOperand(2); | |||
| 27743 | SDValue Mask = Op.getOperand(3); | |||
| 27744 | // We add the rounding mode to the node when: | |||
| 27745 | // - an RC opcode is specified, and | |||
| 27746 | // - RC is not "current direction". | |||
| 27747 | unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; | |||
| 27748 | if (IntrWithRoundingModeOpcode != 0) { | |||
| 27749 | SDValue Rnd = Op.getOperand(4); | |||
| 27750 | unsigned RC = 0; | |||
| 27751 | if (isRoundModeSAEToX(Rnd, RC)) | |||
| 27752 | return getVectorMaskingNode( | |||
| 27753 | DAG.getNode(IntrWithRoundingModeOpcode, dl, Op.getValueType(), | |||
| 27754 | Src, DAG.getTargetConstant(RC, dl, MVT::i32)), | |||
| 27755 | Mask, PassThru, Subtarget, DAG); | |||
| 27756 | if (!isRoundModeCurDirection(Rnd)) | |||
| 27757 | return SDValue(); | |||
| 27758 | } | |||
| 27759 | return getVectorMaskingNode( | |||
| 27760 | DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru, | |||
| 27761 | Subtarget, DAG); | |||
| 27762 | } | |||
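| | ||||
| | // In pseudocode, each masked form below reduces to | |||
| | // Result = vselect(Mask, Opc(Src...), PassThru) // undef PassThru -> zero | |||
| | // with an explicit rounding-control operand appended when Opc1 is used. | |||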
| 27763 | case INTR_TYPE_1OP_MASK_SAE: { | |||
| 27764 | SDValue Src = Op.getOperand(1); | |||
| 27765 | SDValue PassThru = Op.getOperand(2); | |||
| 27766 | SDValue Mask = Op.getOperand(3); | |||
| 27767 | SDValue Rnd = Op.getOperand(4); | |||
| 27768 | ||||
| 27769 | unsigned Opc; | |||
| 27770 | if (isRoundModeCurDirection(Rnd)) | |||
| 27771 | Opc = IntrData->Opc0; | |||
| 27772 | else if (isRoundModeSAE(Rnd)) | |||
| 27773 | Opc = IntrData->Opc1; | |||
| 27774 | else | |||
| 27775 | return SDValue(); | |||
| 27776 | ||||
| 27777 | return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru, | |||
| 27778 | Subtarget, DAG); | |||
| 27779 | } | |||
| 27780 | case INTR_TYPE_SCALAR_MASK: { | |||
| 27781 | SDValue Src1 = Op.getOperand(1); | |||
| 27782 | SDValue Src2 = Op.getOperand(2); | |||
| 27783 | SDValue passThru = Op.getOperand(3); | |||
| 27784 | SDValue Mask = Op.getOperand(4); | |||
| 27785 | unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; | |||
| 27786 | // There are 2 kinds of intrinsics in this group: | |||
| 27787 | // (1) With suppress-all-exceptions (SAE) or rounding mode - 6 operands. | |||
| 27788 | // (2) With both rounding mode and SAE - 7 operands. | |||
| 27789 | bool HasRounding = IntrWithRoundingModeOpcode != 0; | |||
| 27790 | if (Op.getNumOperands() == (5U + HasRounding)) { | |||
| 27791 | if (HasRounding) { | |||
| 27792 | SDValue Rnd = Op.getOperand(5); | |||
| 27793 | unsigned RC = 0; | |||
| 27794 | if (isRoundModeSAEToX(Rnd, RC)) | |||
| 27795 | return getScalarMaskingNode( | |||
| 27796 | DAG.getNode(IntrWithRoundingModeOpcode, dl, VT, Src1, Src2, | |||
| 27797 | DAG.getTargetConstant(RC, dl, MVT::i32)), | |||
| 27798 | Mask, passThru, Subtarget, DAG); | |||
| 27799 | if (!isRoundModeCurDirection(Rnd)) | |||
| 27800 | return SDValue(); | |||
| 27801 | } | |||
| 27802 | return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, | |||
| 27803 | Src2), | |||
| 27804 | Mask, passThru, Subtarget, DAG); | |||
| 27805 | } | |||
| 27806 | ||||
| 27807 | assert(Op.getNumOperands() == (6U + HasRounding) && | |||
| 27808 | "Unexpected intrinsic form"); | |||
| 27809 | SDValue RoundingMode = Op.getOperand(5); | |||
| 27810 | unsigned Opc = IntrData->Opc0; | |||
| 27811 | if (HasRounding) { | |||
| 27812 | SDValue Sae = Op.getOperand(6); | |||
| 27813 | if (isRoundModeSAE(Sae)) | |||
| 27814 | Opc = IntrWithRoundingModeOpcode; | |||
| 27815 | else if (!isRoundModeCurDirection(Sae)) | |||
| 27816 | return SDValue(); | |||
| 27817 | } | |||
| 27818 | return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, | |||
| 27819 | Src2, RoundingMode), | |||
| 27820 | Mask, passThru, Subtarget, DAG); | |||
| 27821 | } | |||
| 27822 | case INTR_TYPE_SCALAR_MASK_RND: { | |||
| 27823 | SDValue Src1 = Op.getOperand(1); | |||
| 27824 | SDValue Src2 = Op.getOperand(2); | |||
| 27825 | SDValue passThru = Op.getOperand(3); | |||
| 27826 | SDValue Mask = Op.getOperand(4); | |||
| 27827 | SDValue Rnd = Op.getOperand(5); | |||
| 27828 | ||||
| 27829 | SDValue NewOp; | |||
| 27830 | unsigned RC = 0; | |||
| 27831 | if (isRoundModeCurDirection(Rnd)) | |||
| 27832 | NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2); | |||
| 27833 | else if (isRoundModeSAEToX(Rnd, RC)) | |||
| 27834 | NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, | |||
| 27835 | DAG.getTargetConstant(RC, dl, MVT::i32)); | |||
| 27836 | else | |||
| 27837 | return SDValue(); | |||
| 27838 | ||||
| 27839 | return getScalarMaskingNode(NewOp, Mask, passThru, Subtarget, DAG); | |||
| 27840 | } | |||
| 27841 | case INTR_TYPE_SCALAR_MASK_SAE: { | |||
| 27842 | SDValue Src1 = Op.getOperand(1); | |||
| 27843 | SDValue Src2 = Op.getOperand(2); | |||
| 27844 | SDValue passThru = Op.getOperand(3); | |||
| 27845 | SDValue Mask = Op.getOperand(4); | |||
| 27846 | SDValue Sae = Op.getOperand(5); | |||
| 27847 | unsigned Opc; | |||
| 27848 | if (isRoundModeCurDirection(Sae)) | |||
| 27849 | Opc = IntrData->Opc0; | |||
| 27850 | else if (isRoundModeSAE(Sae)) | |||
| 27851 | Opc = IntrData->Opc1; | |||
| 27852 | else | |||
| 27853 | return SDValue(); | |||
| 27854 | ||||
| 27855 | return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), | |||
| 27856 | Mask, passThru, Subtarget, DAG); | |||
| 27857 | } | |||
| 27858 | case INTR_TYPE_2OP_MASK: { | |||
| 27859 | SDValue Src1 = Op.getOperand(1); | |||
| 27860 | SDValue Src2 = Op.getOperand(2); | |||
| 27861 | SDValue PassThru = Op.getOperand(3); | |||
| 27862 | SDValue Mask = Op.getOperand(4); | |||
| 27863 | SDValue NewOp; | |||
| 27864 | if (IntrData->Opc1 != 0) { | |||
| 27865 | SDValue Rnd = Op.getOperand(5); | |||
| 27866 | unsigned RC = 0; | |||
| 27867 | if (isRoundModeSAEToX(Rnd, RC)) | |||
| 27868 | NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, | |||
| 27869 | DAG.getTargetConstant(RC, dl, MVT::i32)); | |||
| 27870 | else if (!isRoundModeCurDirection(Rnd)) | |||
| 27871 | return SDValue(); | |||
| 27872 | } | |||
| 27873 | if (!NewOp) | |||
| 27874 | NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2); | |||
| 27875 | return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG); | |||
| 27876 | } | |||
| 27877 | case INTR_TYPE_2OP_MASK_SAE: { | |||
| 27878 | SDValue Src1 = Op.getOperand(1); | |||
| 27879 | SDValue Src2 = Op.getOperand(2); | |||
| 27880 | SDValue PassThru = Op.getOperand(3); | |||
| 27881 | SDValue Mask = Op.getOperand(4); | |||
| 27882 | ||||
| 27883 | unsigned Opc = IntrData->Opc0; | |||
| 27884 | if (IntrData->Opc1 != 0) { | |||
| 27885 | SDValue Sae = Op.getOperand(5); | |||
| 27886 | if (isRoundModeSAE(Sae)) | |||
| 27887 | Opc = IntrData->Opc1; | |||
| 27888 | else if (!isRoundModeCurDirection(Sae)) | |||
| 27889 | return SDValue(); | |||
| 27890 | } | |||
| 27891 | ||||
| 27892 | return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2), | |||
| 27893 | Mask, PassThru, Subtarget, DAG); | |||
| 27894 | } | |||
| 27895 | case INTR_TYPE_3OP_SCALAR_MASK_SAE: { | |||
| 27896 | SDValue Src1 = Op.getOperand(1); | |||
| 27897 | SDValue Src2 = Op.getOperand(2); | |||
| 27898 | SDValue Src3 = Op.getOperand(3); | |||
| 27899 | SDValue PassThru = Op.getOperand(4); | |||
| 27900 | SDValue Mask = Op.getOperand(5); | |||
| 27901 | SDValue Sae = Op.getOperand(6); | |||
| 27902 | unsigned Opc; | |||
| 27903 | if (isRoundModeCurDirection(Sae)) | |||
| 27904 | Opc = IntrData->Opc0; | |||
| 27905 | else if (isRoundModeSAE(Sae)) | |||
| 27906 | Opc = IntrData->Opc1; | |||
| 27907 | else | |||
| 27908 | return SDValue(); | |||
| 27909 | ||||
| 27910 | return getScalarMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3), | |||
| 27911 | Mask, PassThru, Subtarget, DAG); | |||
| 27912 | } | |||
| 27913 | case INTR_TYPE_3OP_MASK_SAE: { | |||
| 27914 | SDValue Src1 = Op.getOperand(1); | |||
| 27915 | SDValue Src2 = Op.getOperand(2); | |||
| 27916 | SDValue Src3 = Op.getOperand(3); | |||
| 27917 | SDValue PassThru = Op.getOperand(4); | |||
| 27918 | SDValue Mask = Op.getOperand(5); | |||
| 27919 | ||||
| 27920 | unsigned Opc = IntrData->Opc0; | |||
| 27921 | if (IntrData->Opc1 != 0) { | |||
| 27922 | SDValue Sae = Op.getOperand(6); | |||
| 27923 | if (isRoundModeSAE(Sae)) | |||
| 27924 | Opc = IntrData->Opc1; | |||
| 27925 | else if (!isRoundModeCurDirection(Sae)) | |||
| 27926 | return SDValue(); | |||
| 27927 | } | |||
| 27928 | return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src1, Src2, Src3), | |||
| 27929 | Mask, PassThru, Subtarget, DAG); | |||
| 27930 | } | |||
| 27931 | case BLENDV: { | |||
| 27932 | SDValue Src1 = Op.getOperand(1); | |||
| 27933 | SDValue Src2 = Op.getOperand(2); | |||
| 27934 | SDValue Src3 = Op.getOperand(3); | |||
| 27935 | ||||
| 27936 | EVT MaskVT = Src3.getValueType().changeVectorElementTypeToInteger(); | |||
| 27937 | Src3 = DAG.getBitcast(MaskVT, Src3); | |||
| 27938 | ||||
| 27939 | // Reverse the operands to match VSELECT order. | |||
| 27940 | return DAG.getNode(IntrData->Opc0, dl, VT, Src3, Src2, Src1); | |||
| 27941 | } | |||
| 27942 | case VPERM_2OP: { | |||
| 27943 | SDValue Src1 = Op.getOperand(1); | |||
| 27944 | SDValue Src2 = Op.getOperand(2); | |||
| 27945 | ||||
| 27946 | // Swap Src1 and Src2 in the node creation. | |||
| 27947 | return DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1); | |||
| 27948 | } | |||
| 27949 | case CFMA_OP_MASKZ: | |||
| 27950 | case CFMA_OP_MASK: { | |||
| 27951 | SDValue Src1 = Op.getOperand(1); | |||
| 27952 | SDValue Src2 = Op.getOperand(2); | |||
| 27953 | SDValue Src3 = Op.getOperand(3); | |||
| 27954 | SDValue Mask = Op.getOperand(4); | |||
| 27955 | MVT VT = Op.getSimpleValueType(); | |||
| 27956 | ||||
| 27957 | SDValue PassThru = Src3; | |||
| 27958 | if (IntrData->Type == CFMA_OP_MASKZ) | |||
| 27959 | PassThru = getZeroVector(VT, Subtarget, DAG, dl); | |||
| 27960 | ||||
| 27961 | // We add the rounding mode to the node when: | |||
| 27962 | // - an RC opcode is specified, and | |||
| 27963 | // - RC is not "current direction". | |||
| 27964 | SDValue NewOp; | |||
| 27965 | if (IntrData->Opc1 != 0) { | |||
| 27966 | SDValue Rnd = Op.getOperand(5); | |||
| 27967 | unsigned RC = 0; | |||
| 27968 | if (isRoundModeSAEToX(Rnd, RC)) | |||
| 27969 | NewOp = DAG.getNode(IntrData->Opc1, dl, VT, Src1, Src2, Src3, | |||
| 27970 | DAG.getTargetConstant(RC, dl, MVT::i32)); | |||
| 27971 | else if (!isRoundModeCurDirection(Rnd)) | |||
| 27972 | return SDValue(); | |||
| 27973 | } | |||
| 27974 | if (!NewOp) | |||
| 27975 | NewOp = DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2, Src3); | |||
| 27976 | return getVectorMaskingNode(NewOp, Mask, PassThru, Subtarget, DAG); | |||
| 27977 | } | |||
| 27978 | case IFMA_OP: | |||
| 27979 | // NOTE: We need to swizzle the operands to pass the multiply operands | |||
| 27980 | // first. | |||
| 27981 | return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), | |||
| 27982 | Op.getOperand(2), Op.getOperand(3), Op.getOperand(1)); | |||
| 27983 | case FPCLASSS: { | |||
| 27984 | SDValue Src1 = Op.getOperand(1); | |||
| 27985 | SDValue Imm = Op.getOperand(2); | |||
| 27986 | SDValue Mask = Op.getOperand(3); | |||
| 27987 | SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm); | |||
| 27988 | SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(), | |||
| 27989 | Subtarget, DAG); | |||
| 27990 | // Need to fill with zeros to ensure the bitcast will produce zeroes | |||
| 27991 | // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. | |||
| 27992 | SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, | |||
| 27993 | DAG.getConstant(0, dl, MVT::v8i1), | |||
| 27994 | FPclassMask, DAG.getIntPtrConstant(0, dl)); | |||
| 27995 | return DAG.getBitcast(MVT::i8, Ins); | |||
| 27996 | } | |||
| 27997 | ||||
| 27998 | case CMP_MASK_CC: { | |||
| 27999 | MVT MaskVT = Op.getSimpleValueType(); | |||
| 28000 | SDValue CC = Op.getOperand(3); | |||
| 28001 | SDValue Mask = Op.getOperand(4); | |||
| 28002 | // We specify 2 possible opcodes for intrinsics with rounding modes. | |||
| 28003 | // First, we check whether the intrinsic may have a non-default rounding | |||
| 28004 | // mode (IntrData->Opc1 != 0), then we check the rounding mode operand. | |||
| 28005 | if (IntrData->Opc1 != 0) { | |||
| 28006 | SDValue Sae = Op.getOperand(5); | |||
| 28007 | if (isRoundModeSAE(Sae)) | |||
| 28008 | return DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1), | |||
| 28009 | Op.getOperand(2), CC, Mask, Sae); | |||
| 28010 | if (!isRoundModeCurDirection(Sae)) | |||
| 28011 | return SDValue(); | |||
| 28012 | } | |||
| 28013 | // Default rounding mode. | |||
| 28014 | return DAG.getNode(IntrData->Opc0, dl, MaskVT, | |||
| 28015 | {Op.getOperand(1), Op.getOperand(2), CC, Mask}); | |||
| 28016 | } | |||
| 28017 | case CMP_MASK_SCALAR_CC: { | |||
| 28018 | SDValue Src1 = Op.getOperand(1); | |||
| 28019 | SDValue Src2 = Op.getOperand(2); | |||
| 28020 | SDValue CC = Op.getOperand(3); | |||
| 28021 | SDValue Mask = Op.getOperand(4); | |||
| 28022 | ||||
| 28023 | SDValue Cmp; | |||
| 28024 | if (IntrData->Opc1 != 0) { | |||
| 28025 | SDValue Sae = Op.getOperand(5); | |||
| 28026 | if (isRoundModeSAE(Sae)) | |||
| 28027 | Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Sae); | |||
| 28028 | else if (!isRoundModeCurDirection(Sae)) | |||
| 28029 | return SDValue(); | |||
| 28030 | } | |||
| 28031 | // Default rounding mode. | |||
| 28032 | if (!Cmp.getNode()) | |||
| 28033 | Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC); | |||
| 28034 | ||||
| 28035 | SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(), | |||
| 28036 | Subtarget, DAG); | |||
| 28037 | // Need to fill with zeros to ensure the bitcast will produce zeroes | |||
| 28038 | // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. | |||
| 28039 | SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1, | |||
| 28040 | DAG.getConstant(0, dl, MVT::v8i1), | |||
| 28041 | CmpMask, DAG.getIntPtrConstant(0, dl)); | |||
| 28042 | return DAG.getBitcast(MVT::i8, Ins); | |||
| 28043 | } | |||
| 28044 | case COMI: { // Comparison intrinsics | |||
| 28045 | ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1; | |||
| 28046 | SDValue LHS = Op.getOperand(1); | |||
| 28047 | SDValue RHS = Op.getOperand(2); | |||
| 28048 | // Some conditions require the operands to be swapped. | |||
| 28049 | if (CC == ISD::SETLT || CC == ISD::SETLE) | |||
| 28050 | std::swap(LHS, RHS); | |||
| 28051 | ||||
| 28052 | SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS); | |||
| 28053 | SDValue SetCC; | |||
| 28054 | switch (CC) { | |||
| 28055 | case ISD::SETEQ: { // (ZF = 0 and PF = 0) | |||
| 28056 | SetCC = getSETCC(X86::COND_E, Comi, dl, DAG); | |||
| 28057 | SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG); | |||
| 28058 | SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP); | |||
| 28059 | break; | |||
| 28060 | } | |||
| 28061 | case ISD::SETNE: { // (ZF = 1 or PF = 1) | |||
| 28062 | SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG); | |||
| 28063 | SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG); | |||
| 28064 | SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP); | |||
| 28065 | break; | |||
| 28066 | } | |||
| 28067 | case ISD::SETGT: // (CF = 0 and ZF = 0) | |||
| 28068 | case ISD::SETLT: { // Condition opposite to GT. Operands swapped above. | |||
| 28069 | SetCC = getSETCC(X86::COND_A, Comi, dl, DAG); | |||
| 28070 | break; | |||
| 28071 | } | |||
| 28072 | case ISD::SETGE: // CF = 0 | |||
| 28073 | case ISD::SETLE: // Condition opposite to GE. Operands swapped above. | |||
| 28074 | SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG); | |||
| 28075 | break; | |||
| 28076 | default: | |||
| 28077 | llvm_unreachable("Unexpected illegal condition!")::llvm::llvm_unreachable_internal("Unexpected illegal condition!" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 28077); | |||
| 28078 | } | |||
| 28079 | return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); | |||
| 28080 | } | |||
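| | ||||
| | // For reference: [U]COMIS* sets ZF/PF/CF, with all three set on an | |||
| | // unordered (NaN) compare. In C terms: | |||
| | // SETEQ: !isnan(LHS) && !isnan(RHS) && LHS == RHS (COND_E && COND_NP) | |||
| | // SETNE: isnan(LHS) || isnan(RHS) || LHS != RHS (COND_NE || COND_P) | |||
| | // SETGT: LHS > RHS (COND_A, false if unordered) | |||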
| 28081 | case COMI_RM: { // Comparison intrinsics with Sae | |||
| 28082 | SDValue LHS = Op.getOperand(1); | |||
| 28083 | SDValue RHS = Op.getOperand(2); | |||
| 28084 | unsigned CondVal = Op.getConstantOperandVal(3); | |||
| 28085 | SDValue Sae = Op.getOperand(4); | |||
| 28086 | ||||
| 28087 | SDValue FCmp; | |||
| 28088 | if (isRoundModeCurDirection(Sae)) | |||
| 28089 | FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS, | |||
| 28090 | DAG.getTargetConstant(CondVal, dl, MVT::i8)); | |||
| 28091 | else if (isRoundModeSAE(Sae)) | |||
| 28092 | FCmp = DAG.getNode(X86ISD::FSETCCM_SAE, dl, MVT::v1i1, LHS, RHS, | |||
| 28093 | DAG.getTargetConstant(CondVal, dl, MVT::i8), Sae); | |||
| 28094 | else | |||
| 28095 | return SDValue(); | |||
| 28096 | // Need to fill with zeros to ensure the bitcast will produce zeroes | |||
| 28097 | // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. | |||
| 28098 | SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, | |||
| 28099 | DAG.getConstant(0, dl, MVT::v16i1), | |||
| 28100 | FCmp, DAG.getIntPtrConstant(0, dl)); | |||
| 28101 | return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, | |||
| 28102 | DAG.getBitcast(MVT::i16, Ins)); | |||
| 28103 | } | |||
| 28104 | case VSHIFT: { | |||
| 28105 | SDValue SrcOp = Op.getOperand(1); | |||
| 28106 | SDValue ShAmt = Op.getOperand(2); | |||
| 28107 | assert(ShAmt.getValueType() == MVT::i32 && | |||
| 28108 | "Unexpected VSHIFT amount type"); | |||
| 28109 | ||||
| 28110 | // Catch shift-by-constant. | |||
| 28111 | if (auto *CShAmt = dyn_cast<ConstantSDNode>(ShAmt)) | |||
| 28112 | return getTargetVShiftByConstNode(IntrData->Opc0, dl, | |||
| 28113 | Op.getSimpleValueType(), SrcOp, | |||
| 28114 | CShAmt->getZExtValue(), DAG); | |||
| 28115 | ||||
| 28116 | ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, ShAmt); | |||
| 28117 | return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(), | |||
| 28118 | SrcOp, ShAmt, 0, Subtarget, DAG); | |||
| 28119 | } | |||
| 28120 | case COMPRESS_EXPAND_IN_REG: { | |||
| 28121 | SDValue Mask = Op.getOperand(3); | |||
| 28122 | SDValue DataToCompress = Op.getOperand(1); | |||
| 28123 | SDValue PassThru = Op.getOperand(2); | |||
| 28124 | if (ISD::isBuildVectorAllOnes(Mask.getNode())) // return data as is | |||
| 28125 | return Op.getOperand(1); | |||
| 28126 | ||||
| 28127 | // Avoid false dependency. | |||
| 28128 | if (PassThru.isUndef()) | |||
| 28129 | PassThru = getZeroVector(VT, Subtarget, DAG, dl); | |||
| 28130 | ||||
| 28131 | return DAG.getNode(IntrData->Opc0, dl, VT, DataToCompress, PassThru, | |||
| 28132 | Mask); | |||
| 28133 | } | |||
| 28134 | case FIXUPIMM: | |||
| 28135 | case FIXUPIMM_MASKZ: { | |||
| 28136 | SDValue Src1 = Op.getOperand(1); | |||
| 28137 | SDValue Src2 = Op.getOperand(2); | |||
| 28138 | SDValue Src3 = Op.getOperand(3); | |||
| 28139 | SDValue Imm = Op.getOperand(4); | |||
| 28140 | SDValue Mask = Op.getOperand(5); | |||
| 28141 | SDValue Passthru = (IntrData->Type == FIXUPIMM) | |||
| 28142 | ? Src1 | |||
| 28143 | : getZeroVector(VT, Subtarget, DAG, dl); | |||
| 28144 | ||||
| 28145 | unsigned Opc = IntrData->Opc0; | |||
| 28146 | if (IntrData->Opc1 != 0) { | |||
| 28147 | SDValue Sae = Op.getOperand(6); | |||
| 28148 | if (isRoundModeSAE(Sae)) | |||
| 28149 | Opc = IntrData->Opc1; | |||
| 28150 | else if (!isRoundModeCurDirection(Sae)) | |||
| 28151 | return SDValue(); | |||
| 28152 | } | |||
| 28153 | ||||
| 28154 | SDValue FixupImm = DAG.getNode(Opc, dl, VT, Src1, Src2, Src3, Imm); | |||
| 28155 | ||||
| 28156 | if (Opc == X86ISD::VFIXUPIMM || Opc == X86ISD::VFIXUPIMM_SAE) | |||
| 28157 | return getVectorMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG); | |||
| 28158 | ||||
| 28159 | return getScalarMaskingNode(FixupImm, Mask, Passthru, Subtarget, DAG); | |||
| 28160 | } | |||
| 28161 | case ROUNDP: { | |||
| 28162 | assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode"); | |||
| 28163 | // Clear the upper bits of the rounding immediate so that the legacy | |||
| 28164 | // intrinsic can't trigger the scaling behavior of VRNDSCALE. | |||
| 28165 | auto Round = cast<ConstantSDNode>(Op.getOperand(2)); | |||
| 28166 | SDValue RoundingMode = | |||
| 28167 | DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32); | |||
| 28168 | return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), | |||
| 28169 | Op.getOperand(1), RoundingMode); | |||
| 28170 | } | |||
| 28171 | case ROUNDS: { | |||
| 28172 | assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode"); | |||
| 28173 | // Clear the upper bits of the rounding immediate so that the legacy | |||
| 28174 | // intrinsic can't trigger the scaling behavior of VRNDSCALE. | |||
| 28175 | auto Round = cast<ConstantSDNode>(Op.getOperand(3)); | |||
| 28176 | SDValue RoundingMode = | |||
| 28177 | DAG.getTargetConstant(Round->getZExtValue() & 0xf, dl, MVT::i32); | |||
| 28178 | return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), | |||
| 28179 | Op.getOperand(1), Op.getOperand(2), RoundingMode); | |||
| 28180 | } | |||
| 28181 | case BEXTRI: { | |||
| 28182 | assert(IntrData->Opc0 == X86ISD::BEXTRI && "Unexpected opcode"); | |||
| 28183 | ||||
| 28184 | uint64_t Imm = Op.getConstantOperandVal(2); | |||
| 28185 | SDValue Control = DAG.getTargetConstant(Imm & 0xffff, dl, | |||
| 28186 | Op.getValueType()); | |||
| 28187 | return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), | |||
| 28188 | Op.getOperand(1), Control); | |||
| 28189 | } | |||
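| | ||||
| | // Roughly (pseudocode), the BEXTRI control word packs { start, length } | |||
| | // as two bytes: | |||
| | // Start = Control & 0xff; Len = (Control >> 8) & 0xff; | |||
| | // Result = Len < 64 ? (Src >> Start) & ((1ULL << Len) - 1) : Src >> Start; | |||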
| 28190 | // ADC/ADCX/SBB | |||
| 28191 | case ADX: { | |||
| 28192 | SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32); | |||
| 28193 | SDVTList VTs = DAG.getVTList(Op.getOperand(2).getValueType(), MVT::i32); | |||
| 28194 | ||||
| 28195 | SDValue Res; | |||
| 28196 | // If the carry-in is zero, then we should just use ADD/SUB instead of | |||
| 28197 | // ADC/SBB. | |||
| 28198 | if (isNullConstant(Op.getOperand(1))) { | |||
| 28199 | Res = DAG.getNode(IntrData->Opc1, dl, VTs, Op.getOperand(2), | |||
| 28200 | Op.getOperand(3)); | |||
| 28201 | } else { | |||
| 28202 | SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(1), | |||
| 28203 | DAG.getConstant(-1, dl, MVT::i8)); | |||
| 28204 | Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(2), | |||
| 28205 | Op.getOperand(3), GenCF.getValue(1)); | |||
| 28206 | } | |||
| 28207 | SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG); | |||
| 28208 | SDValue Results[] = { SetCC, Res }; | |||
| 28209 | return DAG.getMergeValues(Results, dl); | |||
| 28210 | } | |||
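| | ||||
| | // The carry-in trick above: X86ISD::ADD(CarryIn, 0xff) wraps the i8 exactly | |||
| | // when CarryIn is nonzero, so EFLAGS.CF ends up equal to the logical | |||
| | // carry-in. E.g. CarryIn = 1: 1 + 0xff = 0x100 -> CF = 1; | |||
| | // CarryIn = 0: 0 + 0xff = 0xff -> CF = 0. | |||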
| 28211 | case CVTPD2PS_MASK: | |||
| 28212 | case CVTPD2DQ_MASK: | |||
| 28213 | case CVTQQ2PS_MASK: | |||
| 28214 | case TRUNCATE_TO_REG: { | |||
| 28215 | SDValue Src = Op.getOperand(1); | |||
| 28216 | SDValue PassThru = Op.getOperand(2); | |||
| 28217 | SDValue Mask = Op.getOperand(3); | |||
| 28218 | ||||
| 28219 | if (isAllOnesConstant(Mask)) | |||
| 28220 | return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src); | |||
| 28221 | ||||
| 28222 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 28223 | MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); | |||
| 28224 | Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); | |||
| 28225 | return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), | |||
| 28226 | {Src, PassThru, Mask}); | |||
| 28227 | } | |||
| 28228 | case CVTPS2PH_MASK: { | |||
| 28229 | SDValue Src = Op.getOperand(1); | |||
| 28230 | SDValue Rnd = Op.getOperand(2); | |||
| 28231 | SDValue PassThru = Op.getOperand(3); | |||
| 28232 | SDValue Mask = Op.getOperand(4); | |||
| 28233 | ||||
| 28234 | unsigned RC = 0; | |||
| 28235 | unsigned Opc = IntrData->Opc0; | |||
| 28236 | bool SAE = Src.getValueType().is512BitVector() && | |||
| 28237 | (isRoundModeSAEToX(Rnd, RC) || isRoundModeSAE(Rnd)); | |||
| 28238 | if (SAE) { | |||
| 28239 | Opc = X86ISD::CVTPS2PH_SAE; | |||
| 28240 | Rnd = DAG.getTargetConstant(RC, dl, MVT::i32); | |||
| 28241 | } | |||
| 28242 | ||||
| 28243 | if (isAllOnesConstant(Mask)) | |||
| 28244 | return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd); | |||
| 28245 | ||||
| 28246 | if (SAE) | |||
| 28247 | Opc = X86ISD::MCVTPS2PH_SAE; | |||
| 28248 | else | |||
| 28249 | Opc = IntrData->Opc1; | |||
| 28250 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 28251 | MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements()); | |||
| 28252 | Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); | |||
| 28253 | return DAG.getNode(Opc, dl, Op.getValueType(), Src, Rnd, PassThru, Mask); | |||
| 28254 | } | |||
| 28255 | case CVTNEPS2BF16_MASK: { | |||
| 28256 | SDValue Src = Op.getOperand(1); | |||
| 28257 | SDValue PassThru = Op.getOperand(2); | |||
| 28258 | SDValue Mask = Op.getOperand(3); | |||
| 28259 | ||||
| 28260 | if (ISD::isBuildVectorAllOnes(Mask.getNode())) | |||
| 28261 | return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src); | |||
| 28262 | ||||
| 28263 | // Break false dependency. | |||
| 28264 | if (PassThru.isUndef()) | |||
| 28265 | PassThru = DAG.getConstant(0, dl, PassThru.getValueType()); | |||
| 28266 | ||||
| 28267 | return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru, | |||
| 28268 | Mask); | |||
| 28269 | } | |||
| 28270 | default: | |||
| 28271 | break; | |||
| 28272 | } | |||
| 28273 | } | |||
| 28274 | ||||
| 28275 | switch (IntNo) { | |||
| 28276 | default: return SDValue(); // Don't custom lower most intrinsics. | |||
| 28277 | ||||
| 28278 | // ptest and testp intrinsics. The intrinsics these come from are designed to | |||
| 28279 | // return an integer value, not just an instruction, so lower them to the | |||
| 28280 | // ptest or testp pattern and a setcc for the result. | |||
| 28281 | case Intrinsic::x86_avx512_ktestc_b: | |||
| 28282 | case Intrinsic::x86_avx512_ktestc_w: | |||
| 28283 | case Intrinsic::x86_avx512_ktestc_d: | |||
| 28284 | case Intrinsic::x86_avx512_ktestc_q: | |||
| 28285 | case Intrinsic::x86_avx512_ktestz_b: | |||
| 28286 | case Intrinsic::x86_avx512_ktestz_w: | |||
| 28287 | case Intrinsic::x86_avx512_ktestz_d: | |||
| 28288 | case Intrinsic::x86_avx512_ktestz_q: | |||
| 28289 | case Intrinsic::x86_sse41_ptestz: | |||
| 28290 | case Intrinsic::x86_sse41_ptestc: | |||
| 28291 | case Intrinsic::x86_sse41_ptestnzc: | |||
| 28292 | case Intrinsic::x86_avx_ptestz_256: | |||
| 28293 | case Intrinsic::x86_avx_ptestc_256: | |||
| 28294 | case Intrinsic::x86_avx_ptestnzc_256: | |||
| 28295 | case Intrinsic::x86_avx_vtestz_ps: | |||
| 28296 | case Intrinsic::x86_avx_vtestc_ps: | |||
| 28297 | case Intrinsic::x86_avx_vtestnzc_ps: | |||
| 28298 | case Intrinsic::x86_avx_vtestz_pd: | |||
| 28299 | case Intrinsic::x86_avx_vtestc_pd: | |||
| 28300 | case Intrinsic::x86_avx_vtestnzc_pd: | |||
| 28301 | case Intrinsic::x86_avx_vtestz_ps_256: | |||
| 28302 | case Intrinsic::x86_avx_vtestc_ps_256: | |||
| 28303 | case Intrinsic::x86_avx_vtestnzc_ps_256: | |||
| 28304 | case Intrinsic::x86_avx_vtestz_pd_256: | |||
| 28305 | case Intrinsic::x86_avx_vtestc_pd_256: | |||
| 28306 | case Intrinsic::x86_avx_vtestnzc_pd_256: { | |||
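| | // PTEST sets ZF if (LHS & RHS) == 0 and CF if (~LHS & RHS) == 0; TESTP | |||
| | // and KTEST compute the same flags over sign bits and mask registers, | |||
| | // respectively. Pick the flag each intrinsic is defined to observe. | |||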
| 28307 | unsigned TestOpc = X86ISD::PTEST; | |||
| 28308 | X86::CondCode X86CC; | |||
| 28309 | switch (IntNo) { | |||
| 28310 | default: llvm_unreachable("Bad fallthrough in Intrinsic lowering."); | |||
| 28311 | case Intrinsic::x86_avx512_ktestc_b: | |||
| 28312 | case Intrinsic::x86_avx512_ktestc_w: | |||
| 28313 | case Intrinsic::x86_avx512_ktestc_d: | |||
| 28314 | case Intrinsic::x86_avx512_ktestc_q: | |||
| 28315 | // CF = 1 | |||
| 28316 | TestOpc = X86ISD::KTEST; | |||
| 28317 | X86CC = X86::COND_B; | |||
| 28318 | break; | |||
| 28319 | case Intrinsic::x86_avx512_ktestz_b: | |||
| 28320 | case Intrinsic::x86_avx512_ktestz_w: | |||
| 28321 | case Intrinsic::x86_avx512_ktestz_d: | |||
| 28322 | case Intrinsic::x86_avx512_ktestz_q: | |||
| 28323 | TestOpc = X86ISD::KTEST; | |||
| 28324 | X86CC = X86::COND_E; | |||
| 28325 | break; | |||
| 28326 | case Intrinsic::x86_avx_vtestz_ps: | |||
| 28327 | case Intrinsic::x86_avx_vtestz_pd: | |||
| 28328 | case Intrinsic::x86_avx_vtestz_ps_256: | |||
| 28329 | case Intrinsic::x86_avx_vtestz_pd_256: | |||
| 28330 | TestOpc = X86ISD::TESTP; | |||
| 28331 | [[fallthrough]]; | |||
| 28332 | case Intrinsic::x86_sse41_ptestz: | |||
| 28333 | case Intrinsic::x86_avx_ptestz_256: | |||
| 28334 | // ZF = 1 | |||
| 28335 | X86CC = X86::COND_E; | |||
| 28336 | break; | |||
| 28337 | case Intrinsic::x86_avx_vtestc_ps: | |||
| 28338 | case Intrinsic::x86_avx_vtestc_pd: | |||
| 28339 | case Intrinsic::x86_avx_vtestc_ps_256: | |||
| 28340 | case Intrinsic::x86_avx_vtestc_pd_256: | |||
| 28341 | TestOpc = X86ISD::TESTP; | |||
| 28342 | [[fallthrough]]; | |||
| 28343 | case Intrinsic::x86_sse41_ptestc: | |||
| 28344 | case Intrinsic::x86_avx_ptestc_256: | |||
| 28345 | // CF = 1 | |||
| 28346 | X86CC = X86::COND_B; | |||
| 28347 | break; | |||
| 28348 | case Intrinsic::x86_avx_vtestnzc_ps: | |||
| 28349 | case Intrinsic::x86_avx_vtestnzc_pd: | |||
| 28350 | case Intrinsic::x86_avx_vtestnzc_ps_256: | |||
| 28351 | case Intrinsic::x86_avx_vtestnzc_pd_256: | |||
| 28352 | TestOpc = X86ISD::TESTP; | |||
| 28353 | [[fallthrough]]; | |||
| 28354 | case Intrinsic::x86_sse41_ptestnzc: | |||
| 28355 | case Intrinsic::x86_avx_ptestnzc_256: | |||
| 28356 | // ZF and CF = 0 | |||
| 28357 | X86CC = X86::COND_A; | |||
| 28358 | break; | |||
| 28359 | } | |||
| 28360 | ||||
| 28361 | SDValue LHS = Op.getOperand(1); | |||
| 28362 | SDValue RHS = Op.getOperand(2); | |||
| 28363 | SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS); | |||
| 28364 | SDValue SetCC = getSETCC(X86CC, Test, dl, DAG); | |||
| 28365 | return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); | |||
| 28366 | } | |||
| 28367 | ||||
| 28368 | case Intrinsic::x86_sse42_pcmpistria128: | |||
| 28369 | case Intrinsic::x86_sse42_pcmpestria128: | |||
| 28370 | case Intrinsic::x86_sse42_pcmpistric128: | |||
| 28371 | case Intrinsic::x86_sse42_pcmpestric128: | |||
| 28372 | case Intrinsic::x86_sse42_pcmpistrio128: | |||
| 28373 | case Intrinsic::x86_sse42_pcmpestrio128: | |||
| 28374 | case Intrinsic::x86_sse42_pcmpistris128: | |||
| 28375 | case Intrinsic::x86_sse42_pcmpestris128: | |||
| 28376 | case Intrinsic::x86_sse42_pcmpistriz128: | |||
| 28377 | case Intrinsic::x86_sse42_pcmpestriz128: { | |||
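| | // PCMPISTR/PCMPESTR produce an index (i32), a mask (v16i8) and EFLAGS; | |||
| | // the *ia/*ic/*io/*is/*iz intrinsics each observe a single flag, so emit | |||
| | // the compare once and setcc on result value 2 (the flags). | |||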
| 28378 | unsigned Opcode; | |||
| 28379 | X86::CondCode X86CC; | |||
| 28380 | switch (IntNo) { | |||
| 28381 | default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. | |||
| 28382 | case Intrinsic::x86_sse42_pcmpistria128: | |||
| 28383 | Opcode = X86ISD::PCMPISTR; | |||
| 28384 | X86CC = X86::COND_A; | |||
| 28385 | break; | |||
| 28386 | case Intrinsic::x86_sse42_pcmpestria128: | |||
| 28387 | Opcode = X86ISD::PCMPESTR; | |||
| 28388 | X86CC = X86::COND_A; | |||
| 28389 | break; | |||
| 28390 | case Intrinsic::x86_sse42_pcmpistric128: | |||
| 28391 | Opcode = X86ISD::PCMPISTR; | |||
| 28392 | X86CC = X86::COND_B; | |||
| 28393 | break; | |||
| 28394 | case Intrinsic::x86_sse42_pcmpestric128: | |||
| 28395 | Opcode = X86ISD::PCMPESTR; | |||
| 28396 | X86CC = X86::COND_B; | |||
| 28397 | break; | |||
| 28398 | case Intrinsic::x86_sse42_pcmpistrio128: | |||
| 28399 | Opcode = X86ISD::PCMPISTR; | |||
| 28400 | X86CC = X86::COND_O; | |||
| 28401 | break; | |||
| 28402 | case Intrinsic::x86_sse42_pcmpestrio128: | |||
| 28403 | Opcode = X86ISD::PCMPESTR; | |||
| 28404 | X86CC = X86::COND_O; | |||
| 28405 | break; | |||
| 28406 | case Intrinsic::x86_sse42_pcmpistris128: | |||
| 28407 | Opcode = X86ISD::PCMPISTR; | |||
| 28408 | X86CC = X86::COND_S; | |||
| 28409 | break; | |||
| 28410 | case Intrinsic::x86_sse42_pcmpestris128: | |||
| 28411 | Opcode = X86ISD::PCMPESTR; | |||
| 28412 | X86CC = X86::COND_S; | |||
| 28413 | break; | |||
| 28414 | case Intrinsic::x86_sse42_pcmpistriz128: | |||
| 28415 | Opcode = X86ISD::PCMPISTR; | |||
| 28416 | X86CC = X86::COND_E; | |||
| 28417 | break; | |||
| 28418 | case Intrinsic::x86_sse42_pcmpestriz128: | |||
| 28419 | Opcode = X86ISD::PCMPESTR; | |||
| 28420 | X86CC = X86::COND_E; | |||
| 28421 | break; | |||
| 28422 | } | |||
| 28423 | SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops())); | |||
| 28424 | SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); | |||
| 28425 | SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps).getValue(2); | |||
| 28426 | SDValue SetCC = getSETCC(X86CC, PCMP, dl, DAG); | |||
| 28427 | return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC); | |||
| 28428 | } | |||
| 28429 | ||||
| 28430 | case Intrinsic::x86_sse42_pcmpistri128: | |||
| 28431 | case Intrinsic::x86_sse42_pcmpestri128: { | |||
| 28432 | unsigned Opcode; | |||
| 28433 | if (IntNo == Intrinsic::x86_sse42_pcmpistri128) | |||
| 28434 | Opcode = X86ISD::PCMPISTR; | |||
| 28435 | else | |||
| 28436 | Opcode = X86ISD::PCMPESTR; | |||
| 28437 | ||||
| 28438 | SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops())); | |||
| 28439 | SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); | |||
| 28440 | return DAG.getNode(Opcode, dl, VTs, NewOps); | |||
| 28441 | } | |||
| 28442 | ||||
| 28443 | case Intrinsic::x86_sse42_pcmpistrm128: | |||
| 28444 | case Intrinsic::x86_sse42_pcmpestrm128: { | |||
| 28445 | unsigned Opcode; | |||
| 28446 | if (IntNo == Intrinsic::x86_sse42_pcmpistrm128) | |||
| 28447 | Opcode = X86ISD::PCMPISTR; | |||
| 28448 | else | |||
| 28449 | Opcode = X86ISD::PCMPESTR; | |||
| 28450 | ||||
| 28451 | SmallVector<SDValue, 5> NewOps(llvm::drop_begin(Op->ops())); | |||
| 28452 | SDVTList VTs = DAG.getVTList(MVT::i32, MVT::v16i8, MVT::i32); | |||
| 28453 | return DAG.getNode(Opcode, dl, VTs, NewOps).getValue(1); | |||
| 28454 | } | |||
| 28455 | ||||
| 28456 | case Intrinsic::eh_sjlj_lsda: { | |||
| 28457 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 28458 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 28459 | MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); | |||
| 28460 | auto &Context = MF.getMMI().getContext(); | |||
| 28461 | MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") + | |||
| 28462 | Twine(MF.getFunctionNumber())); | |||
| 28463 | return DAG.getNode(getGlobalWrapperKind(), dl, VT, | |||
| 28464 | DAG.getMCSymbol(S, PtrVT)); | |||
| 28465 | } | |||
| 28466 | ||||
| 28467 | case Intrinsic::x86_seh_lsda: { | |||
| 28468 | // Compute the symbol for the LSDA. We know it'll get emitted later. | |||
| 28469 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 28470 | SDValue Op1 = Op.getOperand(1); | |||
| 28471 | auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal()); | |||
| 28472 | MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol( | |||
| 28473 | GlobalValue::dropLLVMManglingEscape(Fn->getName())); | |||
| 28474 | ||||
| 28475 | // Generate a simple absolute symbol reference. This intrinsic is only | |||
| 28476 | // supported on 32-bit Windows, which isn't PIC. | |||
| 28477 | SDValue Result = DAG.getMCSymbol(LSDASym, VT); | |||
| 28478 | return DAG.getNode(X86ISD::Wrapper, dl, VT, Result); | |||
| 28479 | } | |||
| 28480 | ||||
| 28481 | case Intrinsic::eh_recoverfp: { | |||
| 28482 | SDValue FnOp = Op.getOperand(1); | |||
| 28483 | SDValue IncomingFPOp = Op.getOperand(2); | |||
| 28484 | GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp); | |||
| 28485 | auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr); | |||
| 28486 | if (!Fn) | |||
| 28487 | report_fatal_error( | |||
| 28488 | "llvm.eh.recoverfp must take a function as the first argument"); | |||
| 28489 | return recoverFramePointer(DAG, Fn, IncomingFPOp); | |||
| 28490 | } | |||
| 28491 | ||||
| 28492 | case Intrinsic::localaddress: { | |||
| 28493 | // Returns one of the stack, base, or frame pointer registers, depending on | |||
| 28494 | // which is used to reference local variables. | |||
| 28495 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 28496 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); | |||
| 28497 | unsigned Reg; | |||
| 28498 | if (RegInfo->hasBasePointer(MF)) | |||
| 28499 | Reg = RegInfo->getBaseRegister(); | |||
| 28500 | else { // Handles the SP or FP case. | |||
| 28501 | bool CantUseFP = RegInfo->hasStackRealignment(MF); | |||
| 28502 | if (CantUseFP) | |||
| 28503 | Reg = RegInfo->getPtrSizedStackRegister(MF); | |||
| 28504 | else | |||
| 28505 | Reg = RegInfo->getPtrSizedFrameRegister(MF); | |||
| 28506 | } | |||
| 28507 | return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT); | |||
| 28508 | } | |||
| 28509 | case Intrinsic::x86_avx512_vp2intersect_q_512: | |||
| 28510 | case Intrinsic::x86_avx512_vp2intersect_q_256: | |||
| 28511 | case Intrinsic::x86_avx512_vp2intersect_q_128: | |||
| 28512 | case Intrinsic::x86_avx512_vp2intersect_d_512: | |||
| 28513 | case Intrinsic::x86_avx512_vp2intersect_d_256: | |||
| 28514 | case Intrinsic::x86_avx512_vp2intersect_d_128: { | |||
| 28515 | MVT MaskVT = Op.getSimpleValueType(); | |||
| 28516 | ||||
| 28517 | SDVTList VTs = DAG.getVTList(MVT::Untyped, MVT::Other); | |||
| 28518 | SDLoc DL(Op); | |||
| 28519 | ||||
| 28520 | SDValue Operation = | |||
| 28521 | DAG.getNode(X86ISD::VP2INTERSECT, DL, VTs, | |||
| 28522 | Op->getOperand(1), Op->getOperand(2)); | |||
| 28523 | ||||
| 28524 | SDValue Result0 = DAG.getTargetExtractSubreg(X86::sub_mask_0, DL, | |||
| 28525 | MaskVT, Operation); | |||
| 28526 | SDValue Result1 = DAG.getTargetExtractSubreg(X86::sub_mask_1, DL, | |||
| 28527 | MaskVT, Operation); | |||
| 28528 | return DAG.getMergeValues({Result0, Result1}, DL); | |||
| 28529 | } | |||
| 28530 | case Intrinsic::x86_mmx_pslli_w: | |||
| 28531 | case Intrinsic::x86_mmx_pslli_d: | |||
| 28532 | case Intrinsic::x86_mmx_pslli_q: | |||
| 28533 | case Intrinsic::x86_mmx_psrli_w: | |||
| 28534 | case Intrinsic::x86_mmx_psrli_d: | |||
| 28535 | case Intrinsic::x86_mmx_psrli_q: | |||
| 28536 | case Intrinsic::x86_mmx_psrai_w: | |||
| 28537 | case Intrinsic::x86_mmx_psrai_d: { | |||
| 28538 | SDLoc DL(Op); | |||
| 28539 | SDValue ShAmt = Op.getOperand(2); | |||
| 28540 | // If the argument is a constant, convert it to a target constant. | |||
| 28541 | if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) { | |||
| 28542 | // Clamp out-of-bounds shift amounts, since they would otherwise be masked | |||
| 28543 | // to 8 bits, which could bring them back into bounds. | |||
| 28544 | unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255); | |||
| 28545 | if (ShiftAmount == 0) | |||
| 28546 | return Op.getOperand(1); | |||
| 28547 | ||||
| 28548 | return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), | |||
| 28549 | Op.getOperand(0), Op.getOperand(1), | |||
| 28550 | DAG.getTargetConstant(ShiftAmount, DL, MVT::i32)); | |||
| 28551 | } | |||
| 28552 | ||||
| 28553 | unsigned NewIntrinsic; | |||
| 28554 | switch (IntNo) { | |||
| 28555 | default: llvm_unreachable("Impossible intrinsic"); // Can't reach here. | |||
| 28556 | case Intrinsic::x86_mmx_pslli_w: | |||
| 28557 | NewIntrinsic = Intrinsic::x86_mmx_psll_w; | |||
| 28558 | break; | |||
| 28559 | case Intrinsic::x86_mmx_pslli_d: | |||
| 28560 | NewIntrinsic = Intrinsic::x86_mmx_psll_d; | |||
| 28561 | break; | |||
| 28562 | case Intrinsic::x86_mmx_pslli_q: | |||
| 28563 | NewIntrinsic = Intrinsic::x86_mmx_psll_q; | |||
| 28564 | break; | |||
| 28565 | case Intrinsic::x86_mmx_psrli_w: | |||
| 28566 | NewIntrinsic = Intrinsic::x86_mmx_psrl_w; | |||
| 28567 | break; | |||
| 28568 | case Intrinsic::x86_mmx_psrli_d: | |||
| 28569 | NewIntrinsic = Intrinsic::x86_mmx_psrl_d; | |||
| 28570 | break; | |||
| 28571 | case Intrinsic::x86_mmx_psrli_q: | |||
| 28572 | NewIntrinsic = Intrinsic::x86_mmx_psrl_q; | |||
| 28573 | break; | |||
| 28574 | case Intrinsic::x86_mmx_psrai_w: | |||
| 28575 | NewIntrinsic = Intrinsic::x86_mmx_psra_w; | |||
| 28576 | break; | |||
| 28577 | case Intrinsic::x86_mmx_psrai_d: | |||
| 28578 | NewIntrinsic = Intrinsic::x86_mmx_psra_d; | |||
| 28579 | break; | |||
| 28580 | } | |||
| 28581 | ||||
| 28582 | // The vector shift intrinsics with scalar shift amounts use 32-bit values, | |||
| 28583 | // but the SSE2/MMX shift instructions read 64 bits. Copy the 32 bits to an | |||
| 28584 | // MMX register. | |||
| 28585 | ShAmt = DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, ShAmt); | |||
| 28586 | return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(), | |||
| 28587 | DAG.getTargetConstant(NewIntrinsic, DL, | |||
| 28588 | getPointerTy(DAG.getDataLayout())), | |||
| 28589 | Op.getOperand(1), ShAmt); | |||
| 28590 | } | |||
| 28591 | case Intrinsic::thread_pointer: { | |||
| 28592 | if (Subtarget.isTargetELF()) { | |||
| 28593 | SDLoc dl(Op); | |||
| 28594 | EVT PtrVT = getPointerTy(DAG.getDataLayout()); | |||
| 28595 | // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit). | |||
| 28596 | Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy( | |||
| 28597 | *DAG.getContext(), Subtarget.is64Bit() ? X86AS::FS : X86AS::GS)); | |||
| 28598 | return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), | |||
| 28599 | DAG.getIntPtrConstant(0, dl), MachinePointerInfo(Ptr)); | |||
| 28600 | } | |||
| 28601 | report_fatal_error( | |||
| 28602 | "Target OS doesn't support __builtin_thread_pointer() yet."); | |||
| 28603 | } | |||
| 28604 | } | |||
| 28605 | } | |||
| 28606 | ||||
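| | /// Lower an AVX2 gather intrinsic to an X86ISD::MGATHER memory node. The | |||
| | /// scale operand must be a constant; otherwise lowering fails and returns | |||
| | /// an empty SDValue. | |||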
| 28607 | static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, | |||
| 28608 | SDValue Src, SDValue Mask, SDValue Base, | |||
| 28609 | SDValue Index, SDValue ScaleOp, SDValue Chain, | |||
| 28610 | const X86Subtarget &Subtarget) { | |||
| 28611 | SDLoc dl(Op); | |||
| 28612 | auto *C = dyn_cast<ConstantSDNode>(ScaleOp); | |||
| 28613 | // Scale must be constant. | |||
| 28614 | if (!C) | |||
| 28615 | return SDValue(); | |||
| 28616 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 28617 | SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, | |||
| 28618 | TLI.getPointerTy(DAG.getDataLayout())); | |||
| 28619 | EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger(); | |||
| 28620 | SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other); | |||
| 28621 | // If the source is undef or we know it won't be used, use a zero vector | |||
| 28622 | // to break register dependency. | |||
| 28623 | // TODO: use undef instead and let BreakFalseDeps deal with it? | |||
| 28624 | if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) | |||
| 28625 | Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); | |||
| 28626 | ||||
| 28627 | // Cast mask to an integer type. | |||
| 28628 | Mask = DAG.getBitcast(MaskVT, Mask); | |||
| 28629 | ||||
| 28630 | MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); | |||
| 28631 | ||||
| 28632 | SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; | |||
| 28633 | SDValue Res = | |||
| 28634 | DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops, | |||
| 28635 | MemIntr->getMemoryVT(), MemIntr->getMemOperand()); | |||
| 28636 | return DAG.getMergeValues({Res, Res.getValue(1)}, dl); | |||
| 28637 | } | |||
| 28638 | ||||
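| | /// Lower an AVX-512 gather intrinsic. Unlike the AVX2 form, the mask may | |||
| | /// arrive either as a vXi1 vector or as a scalar integer, which is | |||
| | /// converted to vXi1 first. | |||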
| 28639 | static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG, | |||
| 28640 | SDValue Src, SDValue Mask, SDValue Base, | |||
| 28641 | SDValue Index, SDValue ScaleOp, SDValue Chain, | |||
| 28642 | const X86Subtarget &Subtarget) { | |||
| 28643 | MVT VT = Op.getSimpleValueType(); | |||
| 28644 | SDLoc dl(Op); | |||
| 28645 | auto *C = dyn_cast<ConstantSDNode>(ScaleOp); | |||
| 28646 | // Scale must be constant. | |||
| 28647 | if (!C) | |||
| 28648 | return SDValue(); | |||
| 28649 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 28650 | SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, | |||
| 28651 | TLI.getPointerTy(DAG.getDataLayout())); | |||
| 28652 | unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(), | |||
| 28653 | VT.getVectorNumElements()); | |||
| 28654 | MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); | |||
| 28655 | ||||
| 28656 | // We support two versions of the gather intrinsics: one with a scalar mask | |||
| 28657 | // and one with a vXi1 mask. Convert a scalar mask to vXi1 if necessary. | |||
| 28658 | if (Mask.getValueType() != MaskVT) | |||
| 28659 | Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); | |||
| 28660 | ||||
| 28661 | SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other); | |||
| 28662 | // If the source is undef or we know it won't be used, use a zero vector | |||
| 28663 | // to break register dependency. | |||
| 28664 | // TODO: use undef instead and let BreakFalseDeps deal with it? | |||
| 28665 | if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode())) | |||
| 28666 | Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl); | |||
| 28667 | ||||
| 28668 | MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); | |||
| 28669 | ||||
| 28670 | SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale }; | |||
| 28671 | SDValue Res = | |||
| 28672 | DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops, | |||
| 28673 | MemIntr->getMemoryVT(), MemIntr->getMemOperand()); | |||
| 28674 | return DAG.getMergeValues({Res, Res.getValue(1)}, dl); | |||
| 28675 | } | |||
| 28676 | ||||
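| | /// Lower an AVX-512 scatter intrinsic to an X86ISD::MSCATTER memory node. | |||
| | /// Scatters produce no value, so only a chain result is returned. | |||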
| 28677 | static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, | |||
| 28678 | SDValue Src, SDValue Mask, SDValue Base, | |||
| 28679 | SDValue Index, SDValue ScaleOp, SDValue Chain, | |||
| 28680 | const X86Subtarget &Subtarget) { | |||
| 28681 | SDLoc dl(Op); | |||
| 28682 | auto *C = dyn_cast<ConstantSDNode>(ScaleOp); | |||
| 28683 | // Scale must be constant. | |||
| 28684 | if (!C) | |||
| 28685 | return SDValue(); | |||
| 28686 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 28687 | SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, | |||
| 28688 | TLI.getPointerTy(DAG.getDataLayout())); | |||
| 28689 | unsigned MinElts = std::min(Index.getSimpleValueType().getVectorNumElements(), | |||
| 28690 | Src.getSimpleValueType().getVectorNumElements()); | |||
| 28691 | MVT MaskVT = MVT::getVectorVT(MVT::i1, MinElts); | |||
| 28692 | ||||
| 28693 | // We support two versions of the scatter intrinsics: one with a scalar mask | |||
| 28694 | // and one with a vXi1 mask. Convert a scalar mask to vXi1 if necessary. | |||
| 28695 | if (Mask.getValueType() != MaskVT) | |||
| 28696 | Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); | |||
| 28697 | ||||
| 28698 | MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); | |||
| 28699 | ||||
| 28700 | SDVTList VTs = DAG.getVTList(MVT::Other); | |||
| 28701 | SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale}; | |||
| 28702 | SDValue Res = | |||
| 28703 | DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops, | |||
| 28704 | MemIntr->getMemoryVT(), MemIntr->getMemOperand()); | |||
| 28705 | return Res; | |||
| 28706 | } | |||
| 28707 | ||||
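| | /// Lower a gather/scatter prefetch intrinsic directly to a machine node; | |||
| | /// the operand order (mask, base, scale, index, disp, segment, chain) | |||
| | /// mirrors the X86 memory operand encoding. | |||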
| 28708 | static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, | |||
| 28709 | SDValue Mask, SDValue Base, SDValue Index, | |||
| 28710 | SDValue ScaleOp, SDValue Chain, | |||
| 28711 | const X86Subtarget &Subtarget) { | |||
| 28712 | SDLoc dl(Op); | |||
| 28713 | auto *C = dyn_cast<ConstantSDNode>(ScaleOp); | |||
| 28714 | // Scale must be constant. | |||
| 28715 | if (!C) | |||
| 28716 | return SDValue(); | |||
| 28717 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 28718 | SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, | |||
| 28719 | TLI.getPointerTy(DAG.getDataLayout())); | |||
| 28720 | SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); | |||
| 28721 | SDValue Segment = DAG.getRegister(0, MVT::i32); | |||
| 28722 | MVT MaskVT = | |||
| 28723 | MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); | |||
| 28724 | SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); | |||
| 28725 | SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain}; | |||
| 28726 | SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops); | |||
| 28727 | return SDValue(Res, 0); | |||
| 28728 | } | |||
| 28729 | ||||
| 28730 | /// Handles the lowering of builtin intrinsics with chain that return their | |||
| 28731 | /// value in registers EDX:EAX. | |||
| 28732 | /// If operand SrcReg is a valid register identifier, then operand 2 of N is | |||
| 28733 | /// copied to SrcReg. The assumption is that SrcReg is an implicit input to | |||
| 28734 | /// TargetOpcode. | |||
| 28735 | /// Returns a Glue value which can be used to add an extra copy-from-reg if | |||
| 28736 | /// the expanded intrinsic implicitly defines extra registers (i.e. not just | |||
| 28737 | /// EDX:EAX). | |||
| 28738 | static SDValue expandIntrinsicWChainHelper(SDNode *N, const SDLoc &DL, | |||
| 28739 | SelectionDAG &DAG, | |||
| 28740 | unsigned TargetOpcode, | |||
| 28741 | unsigned SrcReg, | |||
| 28742 | const X86Subtarget &Subtarget, | |||
| 28743 | SmallVectorImpl<SDValue> &Results) { | |||
| 28744 | SDValue Chain = N->getOperand(0); | |||
| 28745 | SDValue Glue; | |||
| 28746 | ||||
| 28747 | if (SrcReg) { | |||
| 28748 | assert(N->getNumOperands() == 3 && "Unexpected number of operands!"); | |||
| 28749 | Chain = DAG.getCopyToReg(Chain, DL, SrcReg, N->getOperand(2), Glue); | |||
| 28750 | Glue = Chain.getValue(1); | |||
| 28751 | } | |||
| 28752 | ||||
| 28753 | SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); | |||
| 28754 | SDValue N1Ops[] = {Chain, Glue}; | |||
| 28755 | SDNode *N1 = DAG.getMachineNode( | |||
| 28756 | TargetOpcode, DL, Tys, ArrayRef<SDValue>(N1Ops, Glue.getNode() ? 2 : 1)); | |||
| 28757 | Chain = SDValue(N1, 0); | |||
| 28758 | ||||
| 28759 | // The expanded instruction returns its 64-bit result in registers EDX:EAX. | |||
| 28760 | SDValue LO, HI; | |||
| 28761 | if (Subtarget.is64Bit()) { | |||
| 28762 | LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1)); | |||
| 28763 | HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64, | |||
| 28764 | LO.getValue(2)); | |||
| 28765 | } else { | |||
| 28766 | LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1)); | |||
| 28767 | HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32, | |||
| 28768 | LO.getValue(2)); | |||
| 28769 | } | |||
| 28770 | Chain = HI.getValue(1); | |||
| 28771 | Glue = HI.getValue(2); | |||
| 28772 | ||||
| 28773 | if (Subtarget.is64Bit()) { | |||
| 28774 | // Merge the two 32-bit values into a 64-bit one. | |||
| 28775 | SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI, | |||
| 28776 | DAG.getConstant(32, DL, MVT::i8)); | |||
| 28777 | Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp)); | |||
| 28778 | Results.push_back(Chain); | |||
| 28779 | return Glue; | |||
| 28780 | } | |||
| 28781 | ||||
| 28782 | // Use a buildpair to merge the two 32-bit values into a 64-bit one. | |||
| 28783 | SDValue Ops[] = { LO, HI }; | |||
| 28784 | SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops); | |||
| 28785 | Results.push_back(Pair); | |||
| 28786 | Results.push_back(Chain); | |||
| 28787 | return Glue; | |||
| 28788 | } | |||
| 28789 | ||||
| 28790 | /// Handles the lowering of builtin intrinsics that read the time stamp counter | |||
| 28791 | /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower | |||
| 28792 | /// READCYCLECOUNTER nodes. | |||
| 28793 | static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode, | |||
| 28794 | SelectionDAG &DAG, | |||
| 28795 | const X86Subtarget &Subtarget, | |||
| 28796 | SmallVectorImpl<SDValue> &Results) { | |||
| 28797 | // The processor's time-stamp counter (a 64-bit MSR) is stored into the | |||
| 28798 | // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR | |||
| 28799 | // and the EAX register is loaded with the low-order 32 bits. | |||
| 28800 | SDValue Glue = expandIntrinsicWChainHelper(N, DL, DAG, Opcode, | |||
| 28801 | /* NoRegister */0, Subtarget, | |||
| 28802 | Results); | |||
| 28803 | if (Opcode != X86::RDTSCP) | |||
| 28804 | return; | |||
| 28805 | ||||
| 28806 | SDValue Chain = Results[1]; | |||
| 28807 | // The RDTSCP instruction also loads the IA32_TSC_AUX MSR (address | |||
| 28808 | // C000_0103H) into the ECX register. Add 'ecx' explicitly to the chain. | |||
| 28809 | SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32, Glue); | |||
| 28810 | Results[1] = ecx; | |||
| 28811 | Results.push_back(ecx.getValue(1)); | |||
| 28812 | } | |||
| 28813 | ||||
| 28814 | static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget, | |||
| 28815 | SelectionDAG &DAG) { | |||
| 28816 | SmallVector<SDValue, 3> Results; | |||
| 28817 | SDLoc DL(Op); | |||
| 28818 | getReadTimeStampCounter(Op.getNode(), DL, X86::RDTSC, DAG, Subtarget, | |||
| 28819 | Results); | |||
| 28820 | return DAG.getMergeValues(Results, DL); | |||
| 28821 | } | |||
| 28822 | ||||
| 28823 | static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) { | |||
| 28824 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 28825 | SDValue Chain = Op.getOperand(0); | |||
| 28826 | SDValue RegNode = Op.getOperand(2); | |||
| 28827 | WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); | |||
| 28828 | if (!EHInfo) | |||
| 28829 | report_fatal_error("EH registrations only live in functions using WinEH"); | |||
| 28830 | ||||
| 28831 | // Cast the operand to an alloca, and remember the frame index. | |||
| 28832 | auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode); | |||
| 28833 | if (!FINode) | |||
| 28834 | report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca"); | |||
| 28835 | EHInfo->EHRegNodeFrameIndex = FINode->getIndex(); | |||
| 28836 | ||||
| 28837 | // Return the chain operand without making any DAG nodes. | |||
| 28838 | return Chain; | |||
| 28839 | } | |||
| 28840 | ||||
| 28841 | static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) { | |||
| 28842 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 28843 | SDValue Chain = Op.getOperand(0); | |||
| 28844 | SDValue EHGuard = Op.getOperand(2); | |||
| 28845 | WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo(); | |||
| 28846 | if (!EHInfo) | |||
| 28847 | report_fatal_error("EH guards only live in functions using WinEH"); | |||
| 28848 | ||||
| 28849 | // Cast the operand to an alloca, and remember the frame index. | |||
| 28850 | auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard); | |||
| 28851 | if (!FINode) | |||
| 28852 | report_fatal_error("llvm.x86.seh.ehguard expects a static alloca"); | |||
| 28853 | EHInfo->EHGuardFrameIndex = FINode->getIndex(); | |||
| 28854 | ||||
| 28855 | // Return the chain operand without making any DAG nodes. | |||
| 28856 | return Chain; | |||
| 28857 | } | |||
| 28858 | ||||
| 28859 | /// Emit Truncating Store with signed or unsigned saturation. | |||
| 28860 | static SDValue | |||
| 28861 | EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val, | |||
| 28862 | SDValue Ptr, EVT MemVT, MachineMemOperand *MMO, | |||
| 28863 | SelectionDAG &DAG) { | |||
| 28864 | SDVTList VTs = DAG.getVTList(MVT::Other); | |||
| 28865 | SDValue Undef = DAG.getUNDEF(Ptr.getValueType()); | |||
| 28866 | SDValue Ops[] = { Chain, Val, Ptr, Undef }; | |||
| 28867 | unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS; | |||
| 28868 | return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO); | |||
| 28869 | } | |||
| 28870 | ||||
| 28871 | /// Emit Masked Truncating Store with signed or unsigned saturation. | |||
| 28872 | static SDValue | |||
| 28873 | EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, | |||
| 28874 | SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT, | |||
| 28875 | MachineMemOperand *MMO, SelectionDAG &DAG) { | |||
| 28876 | SDVTList VTs = DAG.getVTList(MVT::Other); | |||
| 28877 | SDValue Ops[] = { Chain, Val, Ptr, Mask }; | |||
| 28878 | unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS; | |||
| 28879 | return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO); | |||
| 28880 | } | |||
| 28881 | ||||
| 28882 | static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, | |||
| 28883 | SelectionDAG &DAG) { | |||
| 28884 | unsigned IntNo = Op.getConstantOperandVal(1); | |||
| 28885 | const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo); | |||
| 28886 | if (!IntrData) { | |||
| 28887 | switch (IntNo) { | |||
| 28888 | ||||
| 28889 | case Intrinsic::swift_async_context_addr: { | |||
| 28890 | SDLoc dl(Op); | |||
| 28891 | auto &MF = DAG.getMachineFunction(); | |||
| 28892 | auto X86FI = MF.getInfo<X86MachineFunctionInfo>(); | |||
| 28893 | if (Subtarget.is64Bit()) { | |||
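| | // On 64-bit targets the async context slot sits immediately below the | |||
| | // saved frame pointer, so its address is RBP - 8. | |||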
| 28894 | MF.getFrameInfo().setFrameAddressIsTaken(true); | |||
| 28895 | X86FI->setHasSwiftAsyncContext(true); | |||
| 28896 | SDValue Chain = Op->getOperand(0); | |||
| 28897 | SDValue CopyRBP = DAG.getCopyFromReg(Chain, dl, X86::RBP, MVT::i64); | |||
| 28898 | SDValue Result = | |||
| 28899 | SDValue(DAG.getMachineNode(X86::SUB64ri8, dl, MVT::i64, CopyRBP, | |||
| 28900 | DAG.getTargetConstant(8, dl, MVT::i32)), | |||
| 28901 | 0); | |||
| 28902 | // Return { result, chain }. | |||
| 28903 | return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, | |||
| 28904 | CopyRBP.getValue(1)); | |||
| 28905 | } else { | |||
| 28906 | // 32-bit, so there is no special extended frame; create or reuse an | |||
| 28907 | // existing stack slot. | |||
| 28908 | if (!X86FI->getSwiftAsyncContextFrameIdx()) | |||
| 28909 | X86FI->setSwiftAsyncContextFrameIdx( | |||
| 28910 | MF.getFrameInfo().CreateStackObject(4, Align(4), false)); | |||
| 28911 | SDValue Result = | |||
| 28912 | DAG.getFrameIndex(*X86FI->getSwiftAsyncContextFrameIdx(), MVT::i32); | |||
| 28913 | // Return { result, chain }. | |||
| 28914 | return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, | |||
| 28915 | Op->getOperand(0)); | |||
| 28916 | } | |||
| 28917 | } | |||
| 28918 | ||||
| 28919 | case llvm::Intrinsic::x86_seh_ehregnode: | |||
| 28920 | return MarkEHRegistrationNode(Op, DAG); | |||
| 28921 | case llvm::Intrinsic::x86_seh_ehguard: | |||
| 28922 | return MarkEHGuard(Op, DAG); | |||
| 28923 | case llvm::Intrinsic::x86_rdpkru: { | |||
| 28924 | SDLoc dl(Op); | |||
| 28925 | SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); | |||
| 28926 | // Create a RDPKRU node and pass 0 to the ECX parameter. | |||
| 28927 | return DAG.getNode(X86ISD::RDPKRU, dl, VTs, Op.getOperand(0), | |||
| 28928 | DAG.getConstant(0, dl, MVT::i32)); | |||
| 28929 | } | |||
| 28930 | case llvm::Intrinsic::x86_wrpkru: { | |||
| 28931 | SDLoc dl(Op); | |||
| 28932 | // Create a WRPKRU node, pass the input to the EAX parameter, and pass 0 | |||
| 28933 | // to the EDX and ECX parameters. | |||
| 28934 | return DAG.getNode(X86ISD::WRPKRU, dl, MVT::Other, | |||
| 28935 | Op.getOperand(0), Op.getOperand(2), | |||
| 28936 | DAG.getConstant(0, dl, MVT::i32), | |||
| 28937 | DAG.getConstant(0, dl, MVT::i32)); | |||
| 28938 | } | |||
| 28939 | case llvm::Intrinsic::asan_check_memaccess: { | |||
| 28940 | // Mark this as adjustsStack because it will be lowered to a call. | |||
| 28941 | DAG.getMachineFunction().getFrameInfo().setAdjustsStack(true); | |||
| 28942 | // Don't do anything here, we will expand these intrinsics out later. | |||
| 28943 | return Op; | |||
| 28944 | } | |||
| 28945 | case llvm::Intrinsic::x86_flags_read_u32: | |||
| 28946 | case llvm::Intrinsic::x86_flags_read_u64: | |||
| 28947 | case llvm::Intrinsic::x86_flags_write_u32: | |||
| 28948 | case llvm::Intrinsic::x86_flags_write_u64: { | |||
| 28949 | // We need a frame pointer because this will get lowered to a PUSH/POP | |||
| 28950 | // sequence. | |||
| 28951 | MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); | |||
| 28952 | MFI.setHasCopyImplyingStackAdjustment(true); | |||
| 28953 | // Don't do anything here, we will expand these intrinsics out later | |||
| 28954 | // during FinalizeISel in EmitInstrWithCustomInserter. | |||
| 28955 | return Op; | |||
| 28956 | } | |||
| 28957 | case Intrinsic::x86_lwpins32: | |||
| 28958 | case Intrinsic::x86_lwpins64: | |||
| 28959 | case Intrinsic::x86_umwait: | |||
| 28960 | case Intrinsic::x86_tpause: { | |||
| 28961 | SDLoc dl(Op); | |||
| 28962 | SDValue Chain = Op->getOperand(0); | |||
| 28963 | SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); | |||
| 28964 | unsigned Opcode; | |||
| 28965 | ||||
| 28966 | switch (IntNo) { | |||
| 28967 | default: llvm_unreachable("Impossible intrinsic"); | |||
| 28968 | case Intrinsic::x86_umwait: | |||
| 28969 | Opcode = X86ISD::UMWAIT; | |||
| 28970 | break; | |||
| 28971 | case Intrinsic::x86_tpause: | |||
| 28972 | Opcode = X86ISD::TPAUSE; | |||
| 28973 | break; | |||
| 28974 | case Intrinsic::x86_lwpins32: | |||
| 28975 | case Intrinsic::x86_lwpins64: | |||
| 28976 | Opcode = X86ISD::LWPINS; | |||
| 28977 | break; | |||
| 28978 | } | |||
| 28979 | ||||
| 28980 | SDValue Operation = | |||
| 28981 | DAG.getNode(Opcode, dl, VTs, Chain, Op->getOperand(2), | |||
| 28982 | Op->getOperand(3), Op->getOperand(4)); | |||
| 28983 | SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG); | |||
| 28984 | return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, | |||
| 28985 | Operation.getValue(1)); | |||
| 28986 | } | |||
| 28987 | case Intrinsic::x86_enqcmd: | |||
| 28988 | case Intrinsic::x86_enqcmds: { | |||
| 28989 | SDLoc dl(Op); | |||
| 28990 | SDValue Chain = Op.getOperand(0); | |||
| 28991 | SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); | |||
| 28992 | unsigned Opcode; | |||
| 28993 | switch (IntNo) { | |||
| 28994 | default: llvm_unreachable("Impossible intrinsic!"); | |||
| 28995 | case Intrinsic::x86_enqcmd: | |||
| 28996 | Opcode = X86ISD::ENQCMD; | |||
| 28997 | break; | |||
| 28998 | case Intrinsic::x86_enqcmds: | |||
| 28999 | Opcode = X86ISD::ENQCMDS; | |||
| 29000 | break; | |||
| 29001 | } | |||
| 29002 | SDValue Operation = DAG.getNode(Opcode, dl, VTs, Chain, Op.getOperand(2), | |||
| 29003 | Op.getOperand(3)); | |||
| 29004 | SDValue SetCC = getSETCC(X86::COND_E, Operation.getValue(0), dl, DAG); | |||
| 29005 | return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, | |||
| 29006 | Operation.getValue(1)); | |||
| 29007 | } | |||
| 29008 | case Intrinsic::x86_aesenc128kl: | |||
| 29009 | case Intrinsic::x86_aesdec128kl: | |||
| 29010 | case Intrinsic::x86_aesenc256kl: | |||
| 29011 | case Intrinsic::x86_aesdec256kl: { | |||
| 29012 | SDLoc DL(Op); | |||
| 29013 | SDVTList VTs = DAG.getVTList(MVT::v2i64, MVT::i32, MVT::Other); | |||
| 29014 | SDValue Chain = Op.getOperand(0); | |||
| 29015 | unsigned Opcode; | |||
| 29016 | ||||
| 29017 | switch (IntNo) { | |||
| 29018 | default: llvm_unreachable("Impossible intrinsic"); | |||
| 29019 | case Intrinsic::x86_aesenc128kl: | |||
| 29020 | Opcode = X86ISD::AESENC128KL; | |||
| 29021 | break; | |||
| 29022 | case Intrinsic::x86_aesdec128kl: | |||
| 29023 | Opcode = X86ISD::AESDEC128KL; | |||
| 29024 | break; | |||
| 29025 | case Intrinsic::x86_aesenc256kl: | |||
| 29026 | Opcode = X86ISD::AESENC256KL; | |||
| 29027 | break; | |||
| 29028 | case Intrinsic::x86_aesdec256kl: | |||
| 29029 | Opcode = X86ISD::AESDEC256KL; | |||
| 29030 | break; | |||
| 29031 | } | |||
| 29032 | ||||
| 29033 | MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); | |||
| 29034 | MachineMemOperand *MMO = MemIntr->getMemOperand(); | |||
| 29035 | EVT MemVT = MemIntr->getMemoryVT(); | |||
| 29036 | SDValue Operation = DAG.getMemIntrinsicNode( | |||
| 29037 | Opcode, DL, VTs, {Chain, Op.getOperand(2), Op.getOperand(3)}, MemVT, | |||
| 29038 | MMO); | |||
| 29039 | SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(1), DL, DAG); | |||
| 29040 | ||||
| 29041 | return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), | |||
| 29042 | {ZF, Operation.getValue(0), Operation.getValue(2)}); | |||
| 29043 | } | |||
| 29044 | case Intrinsic::x86_aesencwide128kl: | |||
| 29045 | case Intrinsic::x86_aesdecwide128kl: | |||
| 29046 | case Intrinsic::x86_aesencwide256kl: | |||
| 29047 | case Intrinsic::x86_aesdecwide256kl: { | |||
| 29048 | SDLoc DL(Op); | |||
| 29049 | SDVTList VTs = DAG.getVTList( | |||
| 29050 | {MVT::i32, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::v2i64, | |||
| 29051 | MVT::v2i64, MVT::v2i64, MVT::v2i64, MVT::Other}); | |||
| 29052 | SDValue Chain = Op.getOperand(0); | |||
| 29053 | unsigned Opcode; | |||
| 29054 | ||||
| 29055 | switch (IntNo) { | |||
| 29056 | default: llvm_unreachable("Impossible intrinsic"); | |||
| 29057 | case Intrinsic::x86_aesencwide128kl: | |||
| 29058 | Opcode = X86ISD::AESENCWIDE128KL; | |||
| 29059 | break; | |||
| 29060 | case Intrinsic::x86_aesdecwide128kl: | |||
| 29061 | Opcode = X86ISD::AESDECWIDE128KL; | |||
| 29062 | break; | |||
| 29063 | case Intrinsic::x86_aesencwide256kl: | |||
| 29064 | Opcode = X86ISD::AESENCWIDE256KL; | |||
| 29065 | break; | |||
| 29066 | case Intrinsic::x86_aesdecwide256kl: | |||
| 29067 | Opcode = X86ISD::AESDECWIDE256KL; | |||
| 29068 | break; | |||
| 29069 | } | |||
| 29070 | ||||
| 29071 | MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op); | |||
| 29072 | MachineMemOperand *MMO = MemIntr->getMemOperand(); | |||
| 29073 | EVT MemVT = MemIntr->getMemoryVT(); | |||
| 29074 | SDValue Operation = DAG.getMemIntrinsicNode( | |||
| 29075 | Opcode, DL, VTs, | |||
| 29076 | {Chain, Op.getOperand(2), Op.getOperand(3), Op.getOperand(4), | |||
| 29077 | Op.getOperand(5), Op.getOperand(6), Op.getOperand(7), | |||
| 29078 | Op.getOperand(8), Op.getOperand(9), Op.getOperand(10)}, | |||
| 29079 | MemVT, MMO); | |||
| 29080 | SDValue ZF = getSETCC(X86::COND_E, Operation.getValue(0), DL, DAG); | |||
| 29081 | ||||
| 29082 | return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), | |||
| 29083 | {ZF, Operation.getValue(1), Operation.getValue(2), | |||
| 29084 | Operation.getValue(3), Operation.getValue(4), | |||
| 29085 | Operation.getValue(5), Operation.getValue(6), | |||
| 29086 | Operation.getValue(7), Operation.getValue(8), | |||
| 29087 | Operation.getValue(9)}); | |||
| 29088 | } | |||
| 29089 | case Intrinsic::x86_testui: { | |||
| 29090 | SDLoc dl(Op); | |||
| 29091 | SDValue Chain = Op.getOperand(0); | |||
| 29092 | SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other); | |||
| 29093 | SDValue Operation = DAG.getNode(X86ISD::TESTUI, dl, VTs, Chain); | |||
| 29094 | SDValue SetCC = getSETCC(X86::COND_B, Operation.getValue(0), dl, DAG); | |||
| 29095 | return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), SetCC, | |||
| 29096 | Operation.getValue(1)); | |||
| 29097 | } | |||
| 29098 | case Intrinsic::x86_atomic_bts_rm: | |||
| 29099 | case Intrinsic::x86_atomic_btc_rm: | |||
| 29100 | case Intrinsic::x86_atomic_btr_rm: { | |||
| 29101 | SDLoc DL(Op); | |||
| 29102 | MVT VT = Op.getSimpleValueType(); | |||
| 29103 | SDValue Chain = Op.getOperand(0); | |||
| 29104 | SDValue Op1 = Op.getOperand(2); | |||
| 29105 | SDValue Op2 = Op.getOperand(3); | |||
| 29106 | unsigned Opc = IntNo == Intrinsic::x86_atomic_bts_rm ? X86ISD::LBTS_RM | |||
| 29107 | : IntNo == Intrinsic::x86_atomic_btc_rm ? X86ISD::LBTC_RM | |||
| 29108 | : X86ISD::LBTR_RM; | |||
| 29109 | MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand(); | |||
| 29110 | SDValue Res = | |||
| 29111 | DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other), | |||
| 29112 | {Chain, Op1, Op2}, VT, MMO); | |||
| 29113 | Chain = Res.getValue(1); | |||
| 29114 | Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT); | |||
| 29115 | return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain); | |||
| 29116 | } | |||
| 29117 | case Intrinsic::x86_atomic_bts: | |||
| 29118 | case Intrinsic::x86_atomic_btc: | |||
| 29119 | case Intrinsic::x86_atomic_btr: { | |||
| 29120 | SDLoc DL(Op); | |||
| 29121 | MVT VT = Op.getSimpleValueType(); | |||
| 29122 | SDValue Chain = Op.getOperand(0); | |||
| 29123 | SDValue Op1 = Op.getOperand(2); | |||
| 29124 | SDValue Op2 = Op.getOperand(3); | |||
| 29125 | unsigned Opc = IntNo == Intrinsic::x86_atomic_bts ? X86ISD::LBTS | |||
| 29126 | : IntNo == Intrinsic::x86_atomic_btc ? X86ISD::LBTC | |||
| 29127 | : X86ISD::LBTR; | |||
| 29128 | SDValue Size = DAG.getConstant(VT.getScalarSizeInBits(), DL, MVT::i32); | |||
| 29129 | MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand(); | |||
| 29130 | SDValue Res = | |||
| 29131 | DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other), | |||
| 29132 | {Chain, Op1, Op2, Size}, VT, MMO); | |||
| 29133 | Chain = Res.getValue(1); | |||
| 29134 | Res = DAG.getZExtOrTrunc(getSETCC(X86::COND_B, Res, DL, DAG), DL, VT); | |||
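| | // CF held the tested bit; shift the 0/1 setcc result back to the bit's | |||
| | // original position so the return value matches the intrinsic contract. | |||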
| 29135 | unsigned Imm = cast<ConstantSDNode>(Op2)->getZExtValue(); | |||
| 29136 | if (Imm) | |||
| 29137 | Res = DAG.getNode(ISD::SHL, DL, VT, Res, | |||
| 29138 | DAG.getShiftAmountConstant(Imm, VT, DL)); | |||
| 29139 | return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), Res, Chain); | |||
| 29140 | } | |||
| 29141 | case Intrinsic::x86_cmpccxadd32: | |||
| 29142 | case Intrinsic::x86_cmpccxadd64: { | |||
| 29143 | SDLoc DL(Op); | |||
| 29144 | SDValue Chain = Op.getOperand(0); | |||
| 29145 | SDValue Addr = Op.getOperand(2); | |||
| 29146 | SDValue Src1 = Op.getOperand(3); | |||
| 29147 | SDValue Src2 = Op.getOperand(4); | |||
| 29148 | SDValue CC = Op.getOperand(5); | |||
| 29149 | MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand(); | |||
| 29150 | SDValue Operation = DAG.getMemIntrinsicNode( | |||
| 29151 | X86ISD::CMPCCXADD, DL, Op->getVTList(), {Chain, Addr, Src1, Src2, CC}, | |||
| 29152 | MVT::i32, MMO); | |||
| 29153 | return Operation; | |||
| 29154 | } | |||
| 29155 | case Intrinsic::x86_aadd32: | |||
| 29156 | case Intrinsic::x86_aadd64: | |||
| 29157 | case Intrinsic::x86_aand32: | |||
| 29158 | case Intrinsic::x86_aand64: | |||
| 29159 | case Intrinsic::x86_aor32: | |||
| 29160 | case Intrinsic::x86_aor64: | |||
| 29161 | case Intrinsic::x86_axor32: | |||
| 29162 | case Intrinsic::x86_axor64: { | |||
| 29163 | SDLoc DL(Op); | |||
| 29164 | SDValue Chain = Op.getOperand(0); | |||
| 29165 | SDValue Op1 = Op.getOperand(2); | |||
| 29166 | SDValue Op2 = Op.getOperand(3); | |||
| 29167 | MVT VT = Op2.getSimpleValueType(); | |||
| 29168 | unsigned Opc = 0; | |||
| 29169 | switch (IntNo) { | |||
| 29170 | default: | |||
| 29171 | llvm_unreachable("Unknown Intrinsic"); | |||
| 29172 | case Intrinsic::x86_aadd32: | |||
| 29173 | case Intrinsic::x86_aadd64: | |||
| 29174 | Opc = X86ISD::AADD; | |||
| 29175 | break; | |||
| 29176 | case Intrinsic::x86_aand32: | |||
| 29177 | case Intrinsic::x86_aand64: | |||
| 29178 | Opc = X86ISD::AAND; | |||
| 29179 | break; | |||
| 29180 | case Intrinsic::x86_aor32: | |||
| 29181 | case Intrinsic::x86_aor64: | |||
| 29182 | Opc = X86ISD::AOR; | |||
| 29183 | break; | |||
| 29184 | case Intrinsic::x86_axor32: | |||
| 29185 | case Intrinsic::x86_axor64: | |||
| 29186 | Opc = X86ISD::AXOR; | |||
| 29187 | break; | |||
| 29188 | } | |||
| 29189 | MachineMemOperand *MMO = cast<MemSDNode>(Op)->getMemOperand(); | |||
| 29190 | return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), | |||
| 29191 | {Chain, Op1, Op2}, VT, MMO); | |||
| 29192 | } | |||
| 29193 | case Intrinsic::x86_atomic_add_cc: | |||
| 29194 | case Intrinsic::x86_atomic_sub_cc: | |||
| 29195 | case Intrinsic::x86_atomic_or_cc: | |||
| 29196 | case Intrinsic::x86_atomic_and_cc: | |||
| 29197 | case Intrinsic::x86_atomic_xor_cc: { | |||
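| | // Map each intrinsic to the matching lock-prefixed RMW node, then read | |||
| | // the requested condition code from the resulting EFLAGS. | |||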
| 29198 | SDLoc DL(Op); | |||
| 29199 | SDValue Chain = Op.getOperand(0); | |||
| 29200 | SDValue Op1 = Op.getOperand(2); | |||
| 29201 | SDValue Op2 = Op.getOperand(3); | |||
| 29202 | X86::CondCode CC = (X86::CondCode)Op.getConstantOperandVal(4); | |||
| 29203 | MVT VT = Op2.getSimpleValueType(); | |||
| 29204 | unsigned Opc = 0; | |||
| 29205 | switch (IntNo) { | |||
| 29206 | default: | |||
| 29207 | llvm_unreachable("Unknown Intrinsic"); | |||
| 29208 | case Intrinsic::x86_atomic_add_cc: | |||
| 29209 | Opc = X86ISD::LADD; | |||
| 29210 | break; | |||
| 29211 | case Intrinsic::x86_atomic_sub_cc: | |||
| 29212 | Opc = X86ISD::LSUB; | |||
| 29213 | break; | |||
| 29214 | case Intrinsic::x86_atomic_or_cc: | |||
| 29215 | Opc = X86ISD::LOR; | |||
| 29216 | break; | |||
| 29217 | case Intrinsic::x86_atomic_and_cc: | |||
| 29218 | Opc = X86ISD::LAND; | |||
| 29219 | break; | |||
| 29220 | case Intrinsic::x86_atomic_xor_cc: | |||
| 29221 | Opc = X86ISD::LXOR; | |||
| 29222 | break; | |||
| 29223 | } | |||
| 29224 | MachineMemOperand *MMO = cast<MemIntrinsicSDNode>(Op)->getMemOperand(); | |||
| 29225 | SDValue LockArith = | |||
| 29226 | DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::i32, MVT::Other), | |||
| 29227 | {Chain, Op1, Op2}, VT, MMO); | |||
| 29228 | Chain = LockArith.getValue(1); | |||
| 29229 | return DAG.getMergeValues({getSETCC(CC, LockArith, DL, DAG), Chain}, DL); | |||
| 29230 | } | |||
| 29231 | } | |||
| 29232 | return SDValue(); | |||
| 29233 | } | |||
| 29234 | ||||
| 29235 | SDLoc dl(Op); | |||
| 29236 | switch(IntrData->Type) { | |||
| 29237 | default: llvm_unreachable("Unknown Intrinsic Type"); | |||
| 29238 | case RDSEED: | |||
| 29239 | case RDRAND: { | |||
| 29240 | // Emit the node with the right value type. | |||
| 29241 | SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other); | |||
| 29242 | SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); | |||
| 29243 | ||||
| 29244 | // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. | |||
| 29245 | // Otherwise return the generated value, which is then always 0, cast to i32. | |||
| 29246 | SDValue Ops[] = {DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), | |||
| 29247 | DAG.getConstant(1, dl, Op->getValueType(1)), | |||
| 29248 | DAG.getTargetConstant(X86::COND_B, dl, MVT::i8), | |||
| 29249 | SDValue(Result.getNode(), 1)}; | |||
| 29250 | SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops); | |||
| 29251 | ||||
| 29252 | // Return { result, isValid, chain }. | |||
| 29253 | return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, | |||
| 29254 | SDValue(Result.getNode(), 2)); | |||
| 29255 | } | |||
| 29256 | case GATHER_AVX2: { | |||
| 29257 | SDValue Chain = Op.getOperand(0); | |||
| 29258 | SDValue Src = Op.getOperand(2); | |||
| 29259 | SDValue Base = Op.getOperand(3); | |||
| 29260 | SDValue Index = Op.getOperand(4); | |||
| 29261 | SDValue Mask = Op.getOperand(5); | |||
| 29262 | SDValue Scale = Op.getOperand(6); | |||
| 29263 | return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, | |||
| 29264 | Scale, Chain, Subtarget); | |||
| 29265 | } | |||
| 29266 | case GATHER: { | |||
| 29267 | // gather(v1, mask, index, base, scale); | |||
| 29268 | SDValue Chain = Op.getOperand(0); | |||
| 29269 | SDValue Src = Op.getOperand(2); | |||
| 29270 | SDValue Base = Op.getOperand(3); | |||
| 29271 | SDValue Index = Op.getOperand(4); | |||
| 29272 | SDValue Mask = Op.getOperand(5); | |||
| 29273 | SDValue Scale = Op.getOperand(6); | |||
| 29274 | return getGatherNode(Op, DAG, Src, Mask, Base, Index, Scale, | |||
| 29275 | Chain, Subtarget); | |||
| 29276 | } | |||
| 29277 | case SCATTER: { | |||
| 29278 | // scatter(base, mask, index, v1, scale); | |||
| 29279 | SDValue Chain = Op.getOperand(0); | |||
| 29280 | SDValue Base = Op.getOperand(2); | |||
| 29281 | SDValue Mask = Op.getOperand(3); | |||
| 29282 | SDValue Index = Op.getOperand(4); | |||
| 29283 | SDValue Src = Op.getOperand(5); | |||
| 29284 | SDValue Scale = Op.getOperand(6); | |||
| 29285 | return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, | |||
| 29286 | Scale, Chain, Subtarget); | |||
| 29287 | } | |||
| 29288 | case PREFETCH: { | |||
| 29289 | const APInt &HintVal = Op.getConstantOperandAPInt(6); | |||
| 29290 | assert((HintVal == 2 || HintVal == 3) && | |||
| 29291 | "Wrong prefetch hint in intrinsic: should be 2 or 3"); | |||
| 29292 | unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0); | |||
| 29293 | SDValue Chain = Op.getOperand(0); | |||
| 29294 | SDValue Mask = Op.getOperand(2); | |||
| 29295 | SDValue Index = Op.getOperand(3); | |||
| 29296 | SDValue Base = Op.getOperand(4); | |||
| 29297 | SDValue Scale = Op.getOperand(5); | |||
| 29298 | return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain, | |||
| 29299 | Subtarget); | |||
| 29300 | } | |||
| 29301 | // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP). | |||
| 29302 | case RDTSC: { | |||
| 29303 | SmallVector<SDValue, 2> Results; | |||
| 29304 | getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget, | |||
| 29305 | Results); | |||
| 29306 | return DAG.getMergeValues(Results, dl); | |||
| 29307 | } | |||
| 29308 | // Read Performance Monitoring Counters. | |||
| 29309 | case RDPMC: | |||
| 29310 | // Read Processor Register. | |||
| 29311 | case RDPRU: | |||
| 29312 | // GetExtended Control Register. | |||
| 29313 | case XGETBV: { | |||
| 29314 | SmallVector<SDValue, 2> Results; | |||
| 29315 | ||||
| 29316 | // RDPMC uses ECX to select the index of the performance counter to read. | |||
| 29317 | // RDPRU uses ECX to select the processor register to read. | |||
| 29318 | // XGETBV uses ECX to select the index of the XCR register to return. | |||
| 29319 | // The result is stored into registers EDX:EAX. | |||
| 29320 | expandIntrinsicWChainHelper(Op.getNode(), dl, DAG, IntrData->Opc0, X86::ECX, | |||
| 29321 | Subtarget, Results); | |||
| 29322 | return DAG.getMergeValues(Results, dl); | |||
| 29323 | } | |||
| 29324 | // XTEST intrinsics. | |||
| 29325 | case XTEST: { | |||
| 29326 | SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other); | |||
| 29327 | SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); | |||
| 29328 | ||||
| 29329 | SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG); | |||
| 29330 | SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC); | |||
| 29331 | return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), | |||
| 29332 | Ret, SDValue(InTrans.getNode(), 1)); | |||
| 29333 | } | |||
| 29334 | case TRUNCATE_TO_MEM_VI8: | |||
| 29335 | case TRUNCATE_TO_MEM_VI16: | |||
| 29336 | case TRUNCATE_TO_MEM_VI32: { | |||
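| | // These intrinsics store a truncated (and for VTRUNCS/VTRUNCUS, | |||
| | // saturated) vector; an all-ones mask lowers to a plain truncating store. | |||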
| 29337 | SDValue Mask = Op.getOperand(4); | |||
| 29338 | SDValue DataToTruncate = Op.getOperand(3); | |||
| 29339 | SDValue Addr = Op.getOperand(2); | |||
| 29340 | SDValue Chain = Op.getOperand(0); | |||
| 29341 | ||||
| 29342 | MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op); | |||
| 29343 | assert(MemIntr && "Expected MemIntrinsicSDNode!"); | |||
| 29344 | ||||
| 29345 | EVT MemVT = MemIntr->getMemoryVT(); | |||
| 29346 | ||||
| 29347 | uint16_t TruncationOp = IntrData->Opc0; | |||
| 29348 | switch (TruncationOp) { | |||
| 29349 | case X86ISD::VTRUNC: { | |||
| 29350 | if (isAllOnesConstant(Mask)) // return just a truncate store | |||
| 29351 | return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT, | |||
| 29352 | MemIntr->getMemOperand()); | |||
| 29353 | ||||
| 29354 | MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements()); | |||
| 29355 | SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); | |||
| 29356 | SDValue Offset = DAG.getUNDEF(VMask.getValueType()); | |||
| 29357 | ||||
| 29358 | return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask, | |||
| 29359 | MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED, | |||
| 29360 | true /* truncating */); | |||
| 29361 | } | |||
| 29362 | case X86ISD::VTRUNCUS: | |||
| 29363 | case X86ISD::VTRUNCS: { | |||
| 29364 | bool IsSigned = (TruncationOp == X86ISD::VTRUNCS); | |||
| 29365 | if (isAllOnesConstant(Mask)) | |||
| 29366 | return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT, | |||
| 29367 | MemIntr->getMemOperand(), DAG); | |||
| 29368 | ||||
| 29369 | MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements()); | |||
| 29370 | SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl); | |||
| 29371 | ||||
| 29372 | return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, | |||
| 29373 | VMask, MemVT, MemIntr->getMemOperand(), DAG); | |||
| 29374 | } | |||
| 29375 | default: | |||
| 29376 | llvm_unreachable("Unsupported truncstore intrinsic")::llvm::llvm_unreachable_internal("Unsupported truncstore intrinsic" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 29376); | |||
| 29377 | } | |||
| 29378 | } | |||
| 29379 | } | |||
| 29380 | } | |||
| 29381 | ||||
| 29382 | SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, | |||
| 29383 | SelectionDAG &DAG) const { | |||
| 29384 | MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); | |||
| 29385 | MFI.setReturnAddressIsTaken(true); | |||
| 29386 | ||||
| 29387 | if (verifyReturnAddressArgumentIsConstant(Op, DAG)) | |||
| 29388 | return SDValue(); | |||
| 29389 | ||||
| 29390 | unsigned Depth = Op.getConstantOperandVal(0); | |||
| 29391 | SDLoc dl(Op); | |||
| 29392 | EVT PtrVT = getPointerTy(DAG.getDataLayout()); | |||
| 29393 | ||||
| 29394 | if (Depth > 0) { | |||
| 29395 | SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); | |||
| 29396 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); | |||
| 29397 | SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT); | |||
| 29398 | return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), | |||
| 29399 | DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), | |||
| 29400 | MachinePointerInfo()); | |||
| 29401 | } | |||
| 29402 | ||||
| 29403 | // Just load the return address. | |||
| 29404 | SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); | |||
| 29405 | return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, | |||
| 29406 | MachinePointerInfo()); | |||
| 29407 | } | |||
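| | // A hedged sketch of what the lowering above produces on x86-64 with a | |||
| | // frame pointer (SlotSize == 8): __builtin_return_address(1) becomes, | |||
| | // morally, | |||
| | //   fp1 = load(rbp)        ; frame address at depth 1 | |||
| | //   ra  = load(fp1 + 8)    ; the return address sits one slot above the FP | |||
| | // while depth 0 just loads from the fixed return-address frame index. | |||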
| 29408 | ||||
| 29409 | SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op, | |||
| 29410 | SelectionDAG &DAG) const { | |||
| 29411 | DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true); | |||
| 29412 | return getReturnAddressFrameIndex(DAG); | |||
| 29413 | } | |||
| 29414 | ||||
| 29415 | SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { | |||
| 29416 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 29417 | MachineFrameInfo &MFI = MF.getFrameInfo(); | |||
| 29418 | X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); | |||
| 29419 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); | |||
| 29420 | EVT VT = Op.getValueType(); | |||
| 29421 | ||||
| 29422 | MFI.setFrameAddressIsTaken(true); | |||
| 29423 | ||||
| 29424 | if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) { | |||
| 29425 | // Depth > 0 makes no sense on targets which use Windows unwind codes. It | |||
| 29426 | // is not possible to crawl up the stack without looking at the unwind codes | |||
| 29427 | // simultaneously. | |||
| 29428 | int FrameAddrIndex = FuncInfo->getFAIndex(); | |||
| 29429 | if (!FrameAddrIndex) { | |||
| 29430 | // Set up a frame object for the return address. | |||
| 29431 | unsigned SlotSize = RegInfo->getSlotSize(); | |||
| 29432 | FrameAddrIndex = MF.getFrameInfo().CreateFixedObject( | |||
| 29433 | SlotSize, /*SPOffset=*/0, /*IsImmutable=*/false); | |||
| 29434 | FuncInfo->setFAIndex(FrameAddrIndex); | |||
| 29435 | } | |||
| 29436 | return DAG.getFrameIndex(FrameAddrIndex, VT); | |||
| 29437 | } | |||
| 29438 | ||||
| 29439 | unsigned FrameReg = | |||
| 29440 | RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction()); | |||
| 29441 | SDLoc dl(Op); // FIXME probably not meaningful | |||
| 29442 | unsigned Depth = Op.getConstantOperandVal(0); | |||
| 29443 | assert(((FrameReg == X86::RBP && VT == MVT::i64) || | |||
| 29444 | (FrameReg == X86::EBP && VT == MVT::i32)) && | |||
| 29445 | "Invalid Frame Register!"); | |||
| 29446 | SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); | |||
| 29447 | while (Depth--) | |||
| 29448 | FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, | |||
| 29449 | MachinePointerInfo()); | |||
| 29450 | return FrameAddr; | |||
| 29451 | } | |||
| 29452 | ||||
| 29453 | // FIXME? Maybe this could be a TableGen attribute on some registers and | |||
| 29454 | // this table could be generated automatically from RegInfo. | |||
| 29455 | Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT, | |||
| 29456 | const MachineFunction &MF) const { | |||
| 29457 | const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); | |||
| 29458 | ||||
| 29459 | Register Reg = StringSwitch<unsigned>(RegName) | |||
| 29460 | .Case("esp", X86::ESP) | |||
| 29461 | .Case("rsp", X86::RSP) | |||
| 29462 | .Case("ebp", X86::EBP) | |||
| 29463 | .Case("rbp", X86::RBP) | |||
| 29464 | .Default(0); | |||
| 29465 | ||||
| 29466 | if (Reg == X86::EBP || Reg == X86::RBP) { | |||
| 29467 | if (!TFI.hasFP(MF)) | |||
| 29468 | report_fatal_error("register " + StringRef(RegName) + | |||
| 29469 | " is allocatable: function has no frame pointer"); | |||
| 29470 | #ifndef NDEBUG | |||
| 29471 | else { | |||
| 29472 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); | |||
| 29473 | Register FrameReg = RegInfo->getPtrSizedFrameRegister(MF); | |||
| 29474 | assert((FrameReg == X86::EBP || FrameReg == X86::RBP) && | |||
| 29475 | "Invalid Frame Register!"); | |||
| 29476 | } | |||
| 29477 | #endif | |||
| 29478 | } | |||
| 29479 | ||||
| 29480 | if (Reg) | |||
| 29481 | return Reg; | |||
| 29482 | ||||
| 29483 | report_fatal_error("Invalid register name global variable"); | |||
| 29484 | } | |||
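| | // Illustrative use (assumed IR, not from this file): this hook backs the | |||
| | // named-register intrinsics, e.g. | |||
| | //   %sp = call i64 @llvm.read_register.i64(metadata !0)   ; !0 = !{!"rsp"} | |||
| | // "esp"/"rsp" always resolve; "ebp"/"rbp" are rejected unless the function | |||
| | // keeps a frame pointer, since otherwise those registers are allocatable. | |||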
| 29485 | ||||
| 29486 | SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, | |||
| 29487 | SelectionDAG &DAG) const { | |||
| 29488 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); | |||
| 29489 | return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op)); | |||
| 29490 | } | |||
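| | // That is, the saved frame pointer plus the return address: 16 bytes on | |||
| | // x86-64 and 8 bytes on 32-bit x86. | |||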
| 29491 | ||||
| 29492 | Register X86TargetLowering::getExceptionPointerRegister( | |||
| 29493 | const Constant *PersonalityFn) const { | |||
| 29494 | if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR) | |||
| 29495 | return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX; | |||
| 29496 | ||||
| 29497 | return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX; | |||
| 29498 | } | |||
| 29499 | ||||
| 29500 | Register X86TargetLowering::getExceptionSelectorRegister( | |||
| 29501 | const Constant *PersonalityFn) const { | |||
| 29502 | // Funclet personalities don't use selectors (the runtime does the selection). | |||
| 29503 | if (isFuncletEHPersonality(classifyEHPersonality(PersonalityFn))) | |||
| 29504 | return X86::NoRegister; | |||
| 29505 | return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX; | |||
| 29506 | } | |||
| 29507 | ||||
| 29508 | bool X86TargetLowering::needsFixedCatchObjects() const { | |||
| 29509 | return Subtarget.isTargetWin64(); | |||
| 29510 | } | |||
| 29511 | ||||
| 29512 | SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { | |||
| 29513 | SDValue Chain = Op.getOperand(0); | |||
| 29514 | SDValue Offset = Op.getOperand(1); | |||
| 29515 | SDValue Handler = Op.getOperand(2); | |||
| 29516 | SDLoc dl (Op); | |||
| 29517 | ||||
| 29518 | EVT PtrVT = getPointerTy(DAG.getDataLayout()); | |||
| 29519 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); | |||
| 29520 | Register FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction()); | |||
| 29521 | assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) || | |||
| 29522 | (FrameReg == X86::EBP && PtrVT == MVT::i32)) && | |||
| 29523 | "Invalid Frame Register!"); | |||
| 29524 | SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT); | |||
| 29525 | Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX; | |||
| 29526 | ||||
| 29527 | SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame, | |||
| 29528 | DAG.getIntPtrConstant(RegInfo->getSlotSize(), | |||
| 29529 | dl)); | |||
| 29530 | StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset); | |||
| 29531 | Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo()); | |||
| 29532 | Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr); | |||
| 29533 | ||||
| 29534 | return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain, | |||
| 29535 | DAG.getRegister(StoreAddrReg, PtrVT)); | |||
| 29536 | } | |||
| 29537 | ||||
| 29538 | SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op, | |||
| 29539 | SelectionDAG &DAG) const { | |||
| 29540 | SDLoc DL(Op); | |||
| 29541 | // If the subtarget is not 64-bit, we may need the global base reg | |||
| 29542 | // after isel pseudo expansion, i.e., after the CGBR pass has run. | |||
| 29543 | // Therefore, ask for the GlobalBaseReg now, so that the pass | |||
| 29544 | // inserts the code for us in case we need it. | |||
| 29545 | // Otherwise, we would end up referencing a virtual register | |||
| 29546 | // that is never defined! | |||
| 29547 | if (!Subtarget.is64Bit()) { | |||
| 29548 | const X86InstrInfo *TII = Subtarget.getInstrInfo(); | |||
| 29549 | (void)TII->getGlobalBaseReg(&DAG.getMachineFunction()); | |||
| 29550 | } | |||
| 29551 | return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL, | |||
| 29552 | DAG.getVTList(MVT::i32, MVT::Other), | |||
| 29553 | Op.getOperand(0), Op.getOperand(1)); | |||
| 29554 | } | |||
| 29555 | ||||
| 29556 | SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op, | |||
| 29557 | SelectionDAG &DAG) const { | |||
| 29558 | SDLoc DL(Op); | |||
| 29559 | return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other, | |||
| 29560 | Op.getOperand(0), Op.getOperand(1)); | |||
| 29561 | } | |||
| 29562 | ||||
| 29563 | SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op, | |||
| 29564 | SelectionDAG &DAG) const { | |||
| 29565 | SDLoc DL(Op); | |||
| 29566 | return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other, | |||
| 29567 | Op.getOperand(0)); | |||
| 29568 | } | |||
| 29569 | ||||
| 29570 | static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) { | |||
| 29571 | return Op.getOperand(0); | |||
| 29572 | } | |||
| 29573 | ||||
| 29574 | SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, | |||
| 29575 | SelectionDAG &DAG) const { | |||
| 29576 | SDValue Root = Op.getOperand(0); | |||
| 29577 | SDValue Trmp = Op.getOperand(1); // trampoline | |||
| 29578 | SDValue FPtr = Op.getOperand(2); // nested function | |||
| 29579 | SDValue Nest = Op.getOperand(3); // 'nest' parameter value | |||
| 29580 | SDLoc dl (Op); | |||
| 29581 | ||||
| 29582 | const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); | |||
| 29583 | const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); | |||
| 29584 | ||||
| 29585 | if (Subtarget.is64Bit()) { | |||
| 29586 | SDValue OutChains[6]; | |||
| 29587 | ||||
| 29588 | // Large code-model. | |||
| 29589 | const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode. | |||
| 29590 | const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode. | |||
| 29591 | ||||
| 29592 | const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7; | |||
| 29593 | const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7; | |||
| 29594 | ||||
| 29595 | const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix | |||
| 29596 | ||||
| 29597 | // Load the pointer to the nested function into R11. | |||
| 29598 | unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11 | |||
| 29599 | SDValue Addr = Trmp; | |||
| 29600 | OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), | |||
| 29601 | Addr, MachinePointerInfo(TrmpAddr)); | |||
| 29602 | ||||
| 29603 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, | |||
| 29604 | DAG.getConstant(2, dl, MVT::i64)); | |||
| 29605 | OutChains[1] = DAG.getStore(Root, dl, FPtr, Addr, | |||
| 29606 | MachinePointerInfo(TrmpAddr, 2), Align(2)); | |||
| 29607 | ||||
| 29608 | // Load the 'nest' parameter value into R10. | |||
| 29609 | // R10 is specified in X86CallingConv.td | |||
| 29610 | OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10 | |||
| 29611 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, | |||
| 29612 | DAG.getConstant(10, dl, MVT::i64)); | |||
| 29613 | OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), | |||
| 29614 | Addr, MachinePointerInfo(TrmpAddr, 10)); | |||
| 29615 | ||||
| 29616 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, | |||
| 29617 | DAG.getConstant(12, dl, MVT::i64)); | |||
| 29618 | OutChains[3] = DAG.getStore(Root, dl, Nest, Addr, | |||
| 29619 | MachinePointerInfo(TrmpAddr, 12), Align(2)); | |||
| 29620 | ||||
| 29621 | // Jump to the nested function. | |||
| 29622 | OpCode = (JMP64r << 8) | REX_WB; // jmpq *... | |||
| 29623 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, | |||
| 29624 | DAG.getConstant(20, dl, MVT::i64)); | |||
| 29625 | OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16), | |||
| 29626 | Addr, MachinePointerInfo(TrmpAddr, 20)); | |||
| 29627 | ||||
| 29628 | unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11 | |||
| 29629 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, | |||
| 29630 | DAG.getConstant(22, dl, MVT::i64)); | |||
| 29631 | OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8), | |||
| 29632 | Addr, MachinePointerInfo(TrmpAddr, 22)); | |||
| 29633 | ||||
| 29634 | return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); | |||
| 29635 | } else { | |||
| 29636 | const Function *Func = | |||
| 29637 | cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue()); | |||
| 29638 | CallingConv::ID CC = Func->getCallingConv(); | |||
| 29639 | unsigned NestReg; | |||
| 29640 | ||||
| 29641 | switch (CC) { | |||
| 29642 | default: | |||
| 29643 | llvm_unreachable("Unsupported calling convention")::llvm::llvm_unreachable_internal("Unsupported calling convention" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 29643); | |||
| 29644 | case CallingConv::C: | |||
| 29645 | case CallingConv::X86_StdCall: { | |||
| 29646 | // Pass 'nest' parameter in ECX. | |||
| 29647 | // Must be kept in sync with X86CallingConv.td | |||
| 29648 | NestReg = X86::ECX; | |||
| 29649 | ||||
| 29650 | // Check that ECX wasn't needed by an 'inreg' parameter. | |||
| 29651 | FunctionType *FTy = Func->getFunctionType(); | |||
| 29652 | const AttributeList &Attrs = Func->getAttributes(); | |||
| 29653 | ||||
| 29654 | if (!Attrs.isEmpty() && !Func->isVarArg()) { | |||
| 29655 | unsigned InRegCount = 0; | |||
| 29656 | unsigned Idx = 0; | |||
| 29657 | ||||
| 29658 | for (FunctionType::param_iterator I = FTy->param_begin(), | |||
| 29659 | E = FTy->param_end(); I != E; ++I, ++Idx) | |||
| 29660 | if (Attrs.hasParamAttr(Idx, Attribute::InReg)) { | |||
| 29661 | const DataLayout &DL = DAG.getDataLayout(); | |||
| 29662 | // FIXME: should only count parameters that are lowered to integers. | |||
| 29663 | InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32; | |||
| 29664 | } | |||
| 29665 | ||||
| 29666 | if (InRegCount > 2) { | |||
| 29667 | report_fatal_error("Nest register in use - reduce number of inreg" | |||
| 29668 | " parameters!"); | |||
| 29669 | } | |||
| 29670 | } | |||
| 29671 | break; | |||
| 29672 | } | |||
| 29673 | case CallingConv::X86_FastCall: | |||
| 29674 | case CallingConv::X86_ThisCall: | |||
| 29675 | case CallingConv::Fast: | |||
| 29676 | case CallingConv::Tail: | |||
| 29677 | case CallingConv::SwiftTail: | |||
| 29678 | // Pass 'nest' parameter in EAX. | |||
| 29679 | // Must be kept in sync with X86CallingConv.td | |||
| 29680 | NestReg = X86::EAX; | |||
| 29681 | break; | |||
| 29682 | } | |||
| 29683 | ||||
| 29684 | SDValue OutChains[4]; | |||
| 29685 | SDValue Addr, Disp; | |||
| 29686 | ||||
| 29687 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, | |||
| 29688 | DAG.getConstant(10, dl, MVT::i32)); | |||
| 29689 | Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr); | |||
| 29690 | ||||
| 29691 | // This is storing the opcode for MOV32ri. | |||
| 29692 | const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte. | |||
| 29693 | const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7; | |||
| 29694 | OutChains[0] = | |||
| 29695 | DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8), | |||
| 29696 | Trmp, MachinePointerInfo(TrmpAddr)); | |||
| 29697 | ||||
| 29698 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, | |||
| 29699 | DAG.getConstant(1, dl, MVT::i32)); | |||
| 29700 | OutChains[1] = DAG.getStore(Root, dl, Nest, Addr, | |||
| 29701 | MachinePointerInfo(TrmpAddr, 1), Align(1)); | |||
| 29702 | ||||
| 29703 | const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode. | |||
| 29704 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, | |||
| 29705 | DAG.getConstant(5, dl, MVT::i32)); | |||
| 29706 | OutChains[2] = | |||
| 29707 | DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8), Addr, | |||
| 29708 | MachinePointerInfo(TrmpAddr, 5), Align(1)); | |||
| 29709 | ||||
| 29710 | Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp, | |||
| 29711 | DAG.getConstant(6, dl, MVT::i32)); | |||
| 29712 | OutChains[3] = DAG.getStore(Root, dl, Disp, Addr, | |||
| 29713 | MachinePointerInfo(TrmpAddr, 6), Align(1)); | |||
| 29714 | ||||
| 29715 | return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); | |||
| 29716 | } | |||
| 29717 | } | |||
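| | // For reference, an illustrative disassembly of the bytes stored above | |||
| | // (offsets match the stores; stores are little-endian). 64-bit trampoline, | |||
| | // 23 bytes: | |||
| | //   +0   49 BB <imm64 FPtr>    movabsq $FPtr, %r11 | |||
| | //   +10  49 BA <imm64 Nest>    movabsq $Nest, %r10 | |||
| | //   +20  49 FF E3              jmpq   *%r11  (the REX.W bit is ignored) | |||
| | // 32-bit trampoline, 10 bytes, NestReg being ECX or EAX per the CC: | |||
| | //   +0   B8|reg <imm32 Nest>   movl   $Nest, %reg | |||
| | //   +5   E9 <rel32>            jmp    FPtr   (rel32 == FPtr - (Trmp + 10)) | |||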
| 29718 | ||||
| 29719 | SDValue X86TargetLowering::LowerGET_ROUNDING(SDValue Op, | |||
| 29720 | SelectionDAG &DAG) const { | |||
| 29721 | /* | |||
| 29722 | The rounding mode is in bits 11:10 of FPSR, and has the following | |||
| 29723 | settings: | |||
| 29724 | 00 Round to nearest | |||
| 29725 | 01 Round to -inf | |||
| 29726 | 10 Round to +inf | |||
| 29727 | 11 Round to 0 | |||
| 29728 | ||||
| 29729 | GET_ROUNDING, on the other hand, expects the following: | |||
| 29730 | -1 Undefined | |||
| 29731 | 0 Round to 0 | |||
| 29732 | 1 Round to nearest | |||
| 29733 | 2 Round to +inf | |||
| 29734 | 3 Round to -inf | |||
| 29735 | ||||
| 29736 | To perform the conversion, we use a packed lookup table of the four 2-bit | |||
| 29737 | values that we can index by FPSR[11:10] | |||
| 29738 | 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10] | |||
| 29739 | ||||
| 29740 | (0x2d >> ((FPSR & 0xc00) >> 9)) & 3 | |||
| 29741 | */ | |||
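| | // Worked example of the lookup: FPSR[11:10] == 0b01 (round to -inf) gives | |||
| | //   (0x2d >> ((0x400) >> 9)) & 3 == (0b101101 >> 2) & 3 == 3, | |||
| | // which is GET_ROUNDING's "round to -inf"; the other three encodings check | |||
| | // out the same way. | |||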
| 29742 | ||||
| 29743 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 29744 | MVT VT = Op.getSimpleValueType(); | |||
| 29745 | SDLoc DL(Op); | |||
| 29746 | ||||
| 29747 | // Save FP Control Word to stack slot | |||
| 29748 | int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false); | |||
| 29749 | SDValue StackSlot = | |||
| 29750 | DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout())); | |||
| 29751 | ||||
| 29752 | MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI); | |||
| 29753 | ||||
| 29754 | SDValue Chain = Op.getOperand(0); | |||
| 29755 | SDValue Ops[] = {Chain, StackSlot}; | |||
| 29756 | Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL, | |||
| 29757 | DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI, | |||
| 29758 | Align(2), MachineMemOperand::MOStore); | |||
| 29759 | ||||
| 29760 | // Load FP Control Word from stack slot | |||
| 29761 | SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2)); | |||
| 29762 | Chain = CWD.getValue(1); | |||
| 29763 | ||||
| 29764 | // Mask and turn the control bits into a shift for the lookup table. | |||
| 29765 | SDValue Shift = | |||
| 29766 | DAG.getNode(ISD::SRL, DL, MVT::i16, | |||
| 29767 | DAG.getNode(ISD::AND, DL, MVT::i16, | |||
| 29768 | CWD, DAG.getConstant(0xc00, DL, MVT::i16)), | |||
| 29769 | DAG.getConstant(9, DL, MVT::i8)); | |||
| 29770 | Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift); | |||
| 29771 | ||||
| 29772 | SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32); | |||
| 29773 | SDValue RetVal = | |||
| 29774 | DAG.getNode(ISD::AND, DL, MVT::i32, | |||
| 29775 | DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift), | |||
| 29776 | DAG.getConstant(3, DL, MVT::i32)); | |||
| 29777 | ||||
| 29778 | RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT); | |||
| 29779 | ||||
| 29780 | return DAG.getMergeValues({RetVal, Chain}, DL); | |||
| 29781 | } | |||
| 29782 | ||||
| 29783 | SDValue X86TargetLowering::LowerSET_ROUNDING(SDValue Op, | |||
| 29784 | SelectionDAG &DAG) const { | |||
| 29785 | MachineFunction &MF = DAG.getMachineFunction(); | |||
| 29786 | SDLoc DL(Op); | |||
| 29787 | SDValue Chain = Op.getNode()->getOperand(0); | |||
| 29788 | ||||
| 29789 | // The FP control word can be set only from data in memory, so we need to | |||
| 29790 | // allocate stack space to save/load it. | |||
| 29791 | int OldCWFrameIdx = MF.getFrameInfo().CreateStackObject(4, Align(4), false); | |||
| 29792 | SDValue StackSlot = | |||
| 29793 | DAG.getFrameIndex(OldCWFrameIdx, getPointerTy(DAG.getDataLayout())); | |||
| 29794 | MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, OldCWFrameIdx); | |||
| 29795 | MachineMemOperand *MMO = | |||
| 29796 | MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 2, Align(2)); | |||
| 29797 | ||||
| 29798 | // Store FP control word into memory. | |||
| 29799 | SDValue Ops[] = {Chain, StackSlot}; | |||
| 29800 | Chain = DAG.getMemIntrinsicNode( | |||
| 29801 | X86ISD::FNSTCW16m, DL, DAG.getVTList(MVT::Other), Ops, MVT::i16, MMO); | |||
| 29802 | ||||
| 29803 | // Load FP Control Word from stack slot and clear RM field (bits 11:10). | |||
| 29804 | SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI); | |||
| 29805 | Chain = CWD.getValue(1); | |||
| 29806 | CWD = DAG.getNode(ISD::AND, DL, MVT::i16, CWD.getValue(0), | |||
| 29807 | DAG.getConstant(0xf3ff, DL, MVT::i16)); | |||
| 29808 | ||||
| 29809 | // Calculate new rounding mode. | |||
| 29810 | SDValue NewRM = Op.getNode()->getOperand(1); | |||
| 29811 | SDValue RMBits; | |||
| 29812 | if (auto *CVal = dyn_cast<ConstantSDNode>(NewRM)) { | |||
| 29813 | uint64_t RM = CVal->getZExtValue(); | |||
| 29814 | int FieldVal; | |||
| 29815 | switch (static_cast<RoundingMode>(RM)) { | |||
| 29816 | case RoundingMode::NearestTiesToEven: FieldVal = X86::rmToNearest; break; | |||
| 29817 | case RoundingMode::TowardNegative: FieldVal = X86::rmDownward; break; | |||
| 29818 | case RoundingMode::TowardPositive: FieldVal = X86::rmUpward; break; | |||
| 29819 | case RoundingMode::TowardZero: FieldVal = X86::rmTowardZero; break; | |||
| 29820 | default: | |||
| 29821 | llvm_unreachable("rounding mode is not supported by X86 hardware")::llvm::llvm_unreachable_internal("rounding mode is not supported by X86 hardware" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 29821); | |||
| 29822 | } | |||
| 29823 | RMBits = DAG.getConstant(FieldVal, DL, MVT::i16); | |||
| 29824 | } else { | |||
| 29825 | // Need to convert argument into bits of control word: | |||
| 29826 | // 0 Round to 0 -> 11 | |||
| 29827 | // 1 Round to nearest -> 00 | |||
| 29828 | // 2 Round to +inf -> 10 | |||
| 29829 | // 3 Round to -inf -> 01 | |||
| 29830 | // The 2-bit value needs then to be shifted so that it occupies bits 11:10. | |||
| 29831 | // To make the conversion, put all these values into a value 0xc9 and shift | |||
| 29832 | // it left depending on the rounding mode: | |||
| 29833 | // (0xc9 << 4) & 0xc00 = X86::rmTowardZero | |||
| 29834 | // (0xc9 << 6) & 0xc00 = X86::rmToNearest | |||
| 29835 | // ... | |||
| 29836 | // (0xc9 << (2 * NewRM + 4)) & 0xc00 | |||
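| | // e.g. NewRM == 3 (round to -inf): (0xc9 << 10) & 0xc00 == 0x400, i.e. an | |||
| | // RM field of 0b01, matching the x87 encoding in the table above. | |||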
| 29837 | SDValue ShiftValue = | |||
| 29838 | DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, | |||
| 29839 | DAG.getNode(ISD::ADD, DL, MVT::i32, | |||
| 29840 | DAG.getNode(ISD::SHL, DL, MVT::i32, NewRM, | |||
| 29841 | DAG.getConstant(1, DL, MVT::i8)), | |||
| 29842 | DAG.getConstant(4, DL, MVT::i32))); | |||
| 29843 | SDValue Shifted = | |||
| 29844 | DAG.getNode(ISD::SHL, DL, MVT::i16, DAG.getConstant(0xc9, DL, MVT::i16), | |||
| 29845 | ShiftValue); | |||
| 29846 | RMBits = DAG.getNode(ISD::AND, DL, MVT::i16, Shifted, | |||
| 29847 | DAG.getConstant(0xc00, DL, MVT::i16)); | |||
| 29848 | } | |||
| 29849 | ||||
| 29850 | // Update rounding mode bits and store the new FP Control Word into stack. | |||
| 29851 | CWD = DAG.getNode(ISD::OR, DL, MVT::i16, CWD, RMBits); | |||
| 29852 | Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(2)); | |||
| 29853 | ||||
| 29854 | // Load FP control word from the slot. | |||
| 29855 | SDValue OpsLD[] = {Chain, StackSlot}; | |||
| 29856 | MachineMemOperand *MMOL = | |||
| 29857 | MF.getMachineMemOperand(MPI, MachineMemOperand::MOLoad, 2, Align(2)); | |||
| 29858 | Chain = DAG.getMemIntrinsicNode( | |||
| 29859 | X86ISD::FLDCW16m, DL, DAG.getVTList(MVT::Other), OpsLD, MVT::i16, MMOL); | |||
| 29860 | ||||
| 29861 | // If target supports SSE, set MXCSR as well. Rounding mode is encoded in the | |||
| 29862 | // same way but in bits 14:13. | |||
| 29863 | if (Subtarget.hasSSE1()) { | |||
| 29864 | // Store MXCSR into memory. | |||
| 29865 | Chain = DAG.getNode( | |||
| 29866 | ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain, | |||
| 29867 | DAG.getTargetConstant(Intrinsic::x86_sse_stmxcsr, DL, MVT::i32), | |||
| 29868 | StackSlot); | |||
| 29869 | ||||
| 29870 | // Load MXCSR from stack slot and clear RM field (bits 14:13). | |||
| 29871 | SDValue CWD = DAG.getLoad(MVT::i32, DL, Chain, StackSlot, MPI); | |||
| 29872 | Chain = CWD.getValue(1); | |||
| 29873 | CWD = DAG.getNode(ISD::AND, DL, MVT::i32, CWD.getValue(0), | |||
| 29874 | DAG.getConstant(0xffff9fff, DL, MVT::i32)); | |||
| 29875 | ||||
| 29876 | // Shift X87 RM bits from 11:10 to 14:13. | |||
| 29877 | RMBits = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, RMBits); | |||
| 29878 | RMBits = DAG.getNode(ISD::SHL, DL, MVT::i32, RMBits, | |||
| 29879 | DAG.getConstant(3, DL, MVT::i8)); | |||
| 29880 | ||||
| 29881 | // Update rounding mode bits and store the new FP Control Word into stack. | |||
| 29882 | CWD = DAG.getNode(ISD::OR, DL, MVT::i32, CWD, RMBits); | |||
| 29883 | Chain = DAG.getStore(Chain, DL, CWD, StackSlot, MPI, Align(4)); | |||
| 29884 | ||||
| 29885 | // Load MXCSR from the slot. | |||
| 29886 | Chain = DAG.getNode( | |||
| 29887 | ISD::INTRINSIC_VOID, DL, DAG.getVTList(MVT::Other), Chain, | |||
| 29888 | DAG.getTargetConstant(Intrinsic::x86_sse_ldmxcsr, DL, MVT::i32), | |||
| 29889 | StackSlot); | |||
| 29890 | } | |||
| 29891 | ||||
| 29892 | return Chain; | |||
| 29893 | } | |||
| 29894 | ||||
| 29895 | /// Lower a vector CTLZ using a natively supported vector CTLZ instruction. | |||
| 29896 | // | |||
| 29897 | // i8/i16 vectors are implemented using the dword LZCNT vector instruction | |||
| 29898 | // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal, split the | |||
| 29899 | // vector, perform the operation on its Lo and Hi parts, and concatenate | |||
| 29900 | // the results. | |||
| 29901 | static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG, | |||
| 29902 | const X86Subtarget &Subtarget) { | |||
| 29903 | assert(Op.getOpcode() == ISD::CTLZ); | |||
| 29904 | SDLoc dl(Op); | |||
| 29905 | MVT VT = Op.getSimpleValueType(); | |||
| 29906 | MVT EltVT = VT.getVectorElementType(); | |||
| 29907 | unsigned NumElems = VT.getVectorNumElements(); | |||
| 29908 | ||||
| 29909 | assert((EltVT == MVT::i8 || EltVT == MVT::i16) && | |||
| 29910 | "Unsupported element type"); | |||
| 29911 | ||||
| 29912 | // Split the vector; its Lo and Hi parts will be handled in the next iteration. | |||
| 29913 | if (NumElems > 16 || | |||
| 29914 | (NumElems == 16 && !Subtarget.canExtendTo512DQ())) | |||
| 29915 | return splitVectorIntUnary(Op, DAG); | |||
| 29916 | ||||
| 29917 | MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); | |||
| 29918 | assert((NewVT.is256BitVector() || NewVT.is512BitVector()) && | |||
| 29919 | "Unsupported value type for operation"); | |||
| 29920 | ||||
| 29921 | // Use native supported vector instruction vplzcntd. | |||
| 29922 | Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0)); | |||
| 29923 | SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op); | |||
| 29924 | SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode); | |||
| 29925 | SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT); | |||
| 29926 | ||||
| 29927 | return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta); | |||
| 29928 | } | |||
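| | // The identity used above: for an i8 element x, | |||
| | //   ctlz_i8(x) == ctlz_i32(zext32(x)) - 24 | |||
| | // since zero-extension prepends 32 - 8 == 24 leading zeros (delta 16 for | |||
| | // i16 elements). | |||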
| 29929 | ||||
| 29930 | // Lower CTLZ using a PSHUFB lookup table implementation. | |||
| 29931 | static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, | |||
| 29932 | const X86Subtarget &Subtarget, | |||
| 29933 | SelectionDAG &DAG) { | |||
| 29934 | MVT VT = Op.getSimpleValueType(); | |||
| 29935 | int NumElts = VT.getVectorNumElements(); | |||
| 29936 | int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8); | |||
| 29937 | MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes); | |||
| 29938 | ||||
| 29939 | // Per-nibble leading zero PSHUFB lookup table. | |||
| 29940 | const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2, | |||
| 29941 | /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1, | |||
| 29942 | /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0, | |||
| 29943 | /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0}; | |||
| 29944 | ||||
| 29945 | SmallVector<SDValue, 64> LUTVec; | |||
| 29946 | for (int i = 0; i < NumBytes; ++i) | |||
| 29947 | LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); | |||
| 29948 | SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec); | |||
| 29949 | ||||
| 29950 | // Begin by bitcasting the input to byte vector, then split those bytes | |||
| 29951 | // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them. | |||
| 29952 | // If the hi input nibble is zero then we add both results together, otherwise | |||
| 29953 | // we just take the hi result (by masking the lo result to zero before the | |||
| 29954 | // add). | |||
| 29955 | SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0)); | |||
| 29956 | SDValue Zero = DAG.getConstant(0, DL, CurrVT); | |||
| 29957 | ||||
| 29958 | SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT); | |||
| 29959 | SDValue Lo = Op0; | |||
| 29960 | SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift); | |||
| 29961 | SDValue HiZ; | |||
| 29962 | if (CurrVT.is512BitVector()) { | |||
| 29963 | MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements()); | |||
| 29964 | HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ); | |||
| 29965 | HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ); | |||
| 29966 | } else { | |||
| 29967 | HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ); | |||
| 29968 | } | |||
| 29969 | ||||
| 29970 | Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo); | |||
| 29971 | Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi); | |||
| 29972 | Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ); | |||
| 29973 | SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi); | |||
| 29974 | ||||
| 29975 | // Merge the result back from vXi8 to VT, working on the lo/hi halves | |||
| 29976 | // of the current vector width in the same way we did for the nibbles. | |||
| 29977 | // If the upper half of the input element is zero then add the halves' | |||
| 29978 | // leading zero counts together, otherwise just use the upper half's. | |||
| 29979 | // Double the width of the result until we are at target width. | |||
| 29980 | while (CurrVT != VT) { | |||
| 29981 | int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits(); | |||
| 29982 | int CurrNumElts = CurrVT.getVectorNumElements(); | |||
| 29983 | MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2); | |||
| 29984 | MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2); | |||
| 29985 | SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT); | |||
| 29986 | ||||
| 29987 | // Check if the upper half of the input element is zero. | |||
| 29988 | if (CurrVT.is512BitVector()) { | |||
| 29989 | MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements()); | |||
| 29990 | HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0), | |||
| 29991 | DAG.getBitcast(CurrVT, Zero), ISD::SETEQ); | |||
| 29992 | HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ); | |||
| 29993 | } else { | |||
| 29994 | HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0), | |||
| 29995 | DAG.getBitcast(CurrVT, Zero), ISD::SETEQ); | |||
| 29996 | } | |||
| 29997 | HiZ = DAG.getBitcast(NextVT, HiZ); | |||
| 29998 | ||||
| 29999 | // Move the upper/lower halves to the lower bits as we'll be extending to | |||
| 30000 | // NextVT. Mask the lower result to zero if HiZ is true and add the results | |||
| 30001 | // together. | |||
| 30002 | SDValue ResNext = Res = DAG.getBitcast(NextVT, Res); | |||
| 30003 | SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift); | |||
| 30004 | SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift); | |||
| 30005 | R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1); | |||
| 30006 | Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1); | |||
| 30007 | CurrVT = NextVT; | |||
| 30008 | } | |||
| 30009 | ||||
| 30010 | return Res; | |||
| 30011 | } | |||
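| | // Worked byte examples: for 0x1a the hi nibble is 0x1, so the LUT gives 3 | |||
| | // and the lo result is masked away: ctlz_i8(0x1a) == 3. For 0x0a the hi | |||
| | // nibble is zero, so LUT(0x0) == 4 and LUT(0xa) == 0 are added, giving 4. | |||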
| 30012 | ||||
| 30013 | static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL, | |||
| 30014 | const X86Subtarget &Subtarget, | |||
| 30015 | SelectionDAG &DAG) { | |||
| 30016 | MVT VT = Op.getSimpleValueType(); | |||
| 30017 | ||||
| 30018 | if (Subtarget.hasCDI() && | |||
| 30019 | // vXi8 vectors need to be promoted to vXi32, which needs 512-bit vectors. | |||
| 30020 | (Subtarget.canExtendTo512DQ() || VT.getVectorElementType() != MVT::i8)) | |||
| 30021 | return LowerVectorCTLZ_AVX512CDI(Op, DAG, Subtarget); | |||
| 30022 | ||||
| 30023 | // Decompose 256-bit ops into smaller 128-bit ops. | |||
| 30024 | if (VT.is256BitVector() && !Subtarget.hasInt256()) | |||
| 30025 | return splitVectorIntUnary(Op, DAG); | |||
| 30026 | ||||
| 30027 | // Decompose 512-bit ops into smaller 256-bit ops. | |||
| 30028 | if (VT.is512BitVector() && !Subtarget.hasBWI()) | |||
| 30029 | return splitVectorIntUnary(Op, DAG); | |||
| 30030 | ||||
| 30031 | assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB"); | |||
| 30032 | return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG); | |||
| 30033 | } | |||
| 30034 | ||||
| 30035 | static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget, | |||
| 30036 | SelectionDAG &DAG) { | |||
| 30037 | MVT VT = Op.getSimpleValueType(); | |||
| 30038 | MVT OpVT = VT; | |||
| 30039 | unsigned NumBits = VT.getSizeInBits(); | |||
| 30040 | SDLoc dl(Op); | |||
| 30041 | unsigned Opc = Op.getOpcode(); | |||
| 30042 | ||||
| 30043 | if (VT.isVector()) | |||
| 30044 | return LowerVectorCTLZ(Op, dl, Subtarget, DAG); | |||
| 30045 | ||||
| 30046 | Op = Op.getOperand(0); | |||
| 30047 | if (VT == MVT::i8) { | |||
| 30048 | // Zero extend to i32 since there is no i8 bsr. | |||
| 30049 | OpVT = MVT::i32; | |||
| 30050 | Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op); | |||
| 30051 | } | |||
| 30052 | ||||
| 30053 | // Issue a bsr (scan bits in reverse) which also sets EFLAGS. | |||
| 30054 | SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); | |||
| 30055 | Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op); | |||
| 30056 | ||||
| 30057 | if (Opc == ISD::CTLZ) { | |||
| 30058 | // If src is zero (i.e. bsr sets ZF), returns NumBits. | |||
| 30059 | SDValue Ops[] = {Op, DAG.getConstant(NumBits + NumBits - 1, dl, OpVT), | |||
| 30060 | DAG.getTargetConstant(X86::COND_E, dl, MVT::i8), | |||
| 30061 | Op.getValue(1)}; | |||
| 30062 | Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops); | |||
| 30063 | } | |||
| 30064 | ||||
| 30065 | // Finally xor with NumBits-1. | |||
| 30066 | Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, | |||
| 30067 | DAG.getConstant(NumBits - 1, dl, OpVT)); | |||
| 30068 | ||||
| 30069 | if (VT == MVT::i8) | |||
| 30070 | Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op); | |||
| 30071 | return Op; | |||
| 30072 | } | |||
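| | // e.g. for i32 x == 0x00f00000, bsr yields 23 and 23 ^ 31 == 8 == ctlz(x); | |||
| | // for x == 0, the CMOV substitutes 2*32-1 == 63 and 63 ^ 31 == 32, the | |||
| | // defined CTLZ(0) result. | |||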
| 30073 | ||||
| 30074 | static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget, | |||
| 30075 | SelectionDAG &DAG) { | |||
| 30076 | MVT VT = Op.getSimpleValueType(); | |||
| 30077 | unsigned NumBits = VT.getScalarSizeInBits(); | |||
| 30078 | SDValue N0 = Op.getOperand(0); | |||
| 30079 | SDLoc dl(Op); | |||
| 30080 | ||||
| 30081 | assert(!VT.isVector() && Op.getOpcode() == ISD::CTTZ && | |||
| 30082 | "Only scalar CTTZ requires custom lowering"); | |||
| 30083 | ||||
| 30084 | // Issue a bsf (scan bits forward) which also sets EFLAGS. | |||
| 30085 | SDVTList VTs = DAG.getVTList(VT, MVT::i32); | |||
| 30086 | Op = DAG.getNode(X86ISD::BSF, dl, VTs, N0); | |||
| 30087 | ||||
| 30088 | // If src is known never zero we can skip the CMOV. | |||
| 30089 | if (DAG.isKnownNeverZero(N0)) | |||
| 30090 | return Op; | |||
| 30091 | ||||
| 30092 | // If src is zero (i.e. bsf sets ZF), returns NumBits. | |||
| 30093 | SDValue Ops[] = {Op, DAG.getConstant(NumBits, dl, VT), | |||
| 30094 | DAG.getTargetConstant(X86::COND_E, dl, MVT::i8), | |||
| 30095 | Op.getValue(1)}; | |||
| 30096 | return DAG.getNode(X86ISD::CMOV, dl, VT, Ops); | |||
| 30097 | } | |||
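| | // e.g. for i32 x == 0x50, bsf yields 4 == cttz(x); the CMOV maps x == 0 to | |||
| | // NumBits == 32, CTTZ's defined zero-input result. | |||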
| 30098 | ||||
| 30099 | static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG, | |||
| 30100 | const X86Subtarget &Subtarget) { | |||
| 30101 | MVT VT = Op.getSimpleValueType(); | |||
| 30102 | if (VT == MVT::i16 || VT == MVT::i32) | |||
| 30103 | return lowerAddSubToHorizontalOp(Op, DAG, Subtarget); | |||
| 30104 | ||||
| 30105 | if (VT == MVT::v32i16 || VT == MVT::v64i8) | |||
| 30106 | return splitVectorIntBinary(Op, DAG); | |||
| 30107 | ||||
| 30108 | assert(Op.getSimpleValueType().is256BitVector() && | |||
| 30109 | Op.getSimpleValueType().isInteger() && | |||
| 30110 | "Only handle AVX 256-bit vector integer operation"); | |||
| 30111 | return splitVectorIntBinary(Op, DAG); | |||
| 30112 | } | |||
| 30113 | ||||
| 30114 | static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG, | |||
| 30115 | const X86Subtarget &Subtarget) { | |||
| 30116 | MVT VT = Op.getSimpleValueType(); | |||
| 30117 | SDValue X = Op.getOperand(0), Y = Op.getOperand(1); | |||
| 30118 | unsigned Opcode = Op.getOpcode(); | |||
| 30119 | SDLoc DL(Op); | |||
| 30120 | ||||
| 30121 | if (VT == MVT::v32i16 || VT == MVT::v64i8 || | |||
| 30122 | (VT.is256BitVector() && !Subtarget.hasInt256())) { | |||
| 30123 | assert(Op.getSimpleValueType().isInteger() && | |||
| 30124 | "Only handle AVX vector integer operation"); | |||
| 30125 | return splitVectorIntBinary(Op, DAG); | |||
| 30126 | } | |||
| 30127 | ||||
| 30128 | // Avoid the generic expansion with min/max if we don't have pminu*/pmaxu*. | |||
| 30129 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 30130 | EVT SetCCResultType = | |||
| 30131 | TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); | |||
| 30132 | ||||
| 30133 | unsigned BitWidth = VT.getScalarSizeInBits(); | |||
| 30134 | if (Opcode == ISD::USUBSAT) { | |||
| 30135 | if (!TLI.isOperationLegal(ISD::UMAX, VT) || useVPTERNLOG(Subtarget, VT)) { | |||
| 30136 | // Handle a special-case with a bit-hack instead of cmp+select: | |||
| 30137 | // usubsat X, SMIN --> (X ^ SMIN) & (X s>> BW-1) | |||
| 30138 | // If the target can use VPTERNLOG, DAGToDAG will match this as | |||
| 30139 | // "vpsra + vpternlog" which is better than "vpmax + vpsub" with a | |||
| 30140 | // "broadcast" constant load. | |||
| 30141 | ConstantSDNode *C = isConstOrConstSplat(Y, true); | |||
| 30142 | if (C && C->getAPIntValue().isSignMask()) { | |||
| 30143 | SDValue SignMask = DAG.getConstant(C->getAPIntValue(), DL, VT); | |||
| 30144 | SDValue ShiftAmt = DAG.getConstant(BitWidth - 1, DL, VT); | |||
| 30145 | SDValue Xor = DAG.getNode(ISD::XOR, DL, VT, X, SignMask); | |||
| 30146 | SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, X, ShiftAmt); | |||
| 30147 | return DAG.getNode(ISD::AND, DL, VT, Xor, Sra); | |||
| 30148 | } | |||
| 30149 | } | |||
| 30150 | if (!TLI.isOperationLegal(ISD::UMAX, VT)) { | |||
| 30151 | // usubsat X, Y --> (X >u Y) ? X - Y : 0 | |||
| 30152 | SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, X, Y); | |||
| 30153 | SDValue Cmp = DAG.getSetCC(DL, SetCCResultType, X, Y, ISD::SETUGT); | |||
| 30154 | // TODO: Move this to DAGCombiner? | |||
| 30155 | if (SetCCResultType == VT && | |||
| 30156 | DAG.ComputeNumSignBits(Cmp) == VT.getScalarSizeInBits()) | |||
| 30157 | return DAG.getNode(ISD::AND, DL, VT, Cmp, Sub); | |||
| 30158 | return DAG.getSelect(DL, VT, Cmp, Sub, DAG.getConstant(0, DL, VT)); | |||
| 30159 | } | |||
| 30160 | } | |||
| 30161 | ||||
| 30162 | if ((Opcode == ISD::SADDSAT || Opcode == ISD::SSUBSAT) && | |||
| 30163 | (!VT.isVector() || VT == MVT::v2i64)) { | |||
| 30164 | APInt MinVal = APInt::getSignedMinValue(BitWidth); | |||
| 30165 | APInt MaxVal = APInt::getSignedMaxValue(BitWidth); | |||
| 30166 | SDValue Zero = DAG.getConstant(0, DL, VT); | |||
| 30167 | SDValue Result = | |||
| 30168 | DAG.getNode(Opcode == ISD::SADDSAT ? ISD::SADDO : ISD::SSUBO, DL, | |||
| 30169 | DAG.getVTList(VT, SetCCResultType), X, Y); | |||
| 30170 | SDValue SumDiff = Result.getValue(0); | |||
| 30171 | SDValue Overflow = Result.getValue(1); | |||
| 30172 | SDValue SatMin = DAG.getConstant(MinVal, DL, VT); | |||
| 30173 | SDValue SatMax = DAG.getConstant(MaxVal, DL, VT); | |||
| 30174 | SDValue SumNeg = | |||
| 30175 | DAG.getSetCC(DL, SetCCResultType, SumDiff, Zero, ISD::SETLT); | |||
| 30176 | Result = DAG.getSelect(DL, VT, SumNeg, SatMax, SatMin); | |||
| 30177 | return DAG.getSelect(DL, VT, Overflow, Result, SumDiff); | |||
| 30178 | } | |||
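| | // On signed overflow the wrapped SumDiff has the opposite sign of the true | |||
| | // result, so a negative SumDiff selects SatMax (and vice versa); e.g. i8 | |||
| | // saddsat(100, 100) wraps to -56 with Overflow set, and SumNeg picks 127. | |||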
| 30179 | ||||
| 30180 | // Use default expansion. | |||
| 30181 | return SDValue(); | |||
| 30182 | } | |||
| 30183 | ||||
| 30184 | static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget, | |||
| 30185 | SelectionDAG &DAG) { | |||
| 30186 | MVT VT = Op.getSimpleValueType(); | |||
| 30187 | if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) { | |||
| 30188 | // Since X86 does not have CMOV for 8-bit integer, we don't convert | |||
| 30189 | // 8-bit integer abs to NEG and CMOV. | |||
| 30190 | SDLoc DL(Op); | |||
| 30191 | SDValue N0 = Op.getOperand(0); | |||
| 30192 | SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), | |||
| 30193 | DAG.getConstant(0, DL, VT), N0); | |||
| 30194 | SDValue Ops[] = {N0, Neg, DAG.getTargetConstant(X86::COND_NS, DL, MVT::i8), | |||
| 30195 | SDValue(Neg.getNode(), 1)}; | |||
| 30196 | return DAG.getNode(X86ISD::CMOV, DL, VT, Ops); | |||
| 30197 | } | |||
| 30198 | ||||
| 30199 | // ABS(vXi64 X) --> VPBLENDVPD(X, 0-X, X). | |||
| 30200 | if ((VT == MVT::v2i64 || VT == MVT::v4i64) && Subtarget.hasSSE41()) { | |||
| 30201 | SDLoc DL(Op); | |||
| 30202 | SDValue Src = Op.getOperand(0); | |||
| 30203 | SDValue Sub = | |||
| 30204 | DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Src); | |||
| 30205 | return DAG.getNode(X86ISD::BLENDV, DL, VT, Src, Sub, Src); | |||
| 30206 | } | |||
| 30207 | ||||
| 30208 | if (VT.is256BitVector() && !Subtarget.hasInt256()) { | |||
| 30209 | assert(VT.isInteger() && | |||
| 30210 | "Only handle AVX 256-bit vector integer operation"); | |||
| 30211 | return splitVectorIntUnary(Op, DAG); | |||
| 30212 | } | |||
| 30213 | ||||
| 30214 | if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) | |||
| 30215 | return splitVectorIntUnary(Op, DAG); | |||
| 30216 | ||||
| 30217 | // Default to expand. | |||
| 30218 | return SDValue(); | |||
| 30219 | } | |||
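| | // In the scalar path above, NEG sets SF from 0 - x and the CMOV then picks | |||
| | // 0 - x when COND_NS holds (0 - x is non-negative, i.e. x <= 0), keeping x | |||
| | // otherwise, which yields abs(x) without a branch. | |||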
| 30220 | ||||
| 30221 | static SDValue LowerAVG(SDValue Op, const X86Subtarget &Subtarget, | |||
| 30222 | SelectionDAG &DAG) { | |||
| 30223 | MVT VT = Op.getSimpleValueType(); | |||
| 30224 | ||||
| 30225 | // For AVX1 cases, split to use legal ops. | |||
| 30226 | if (VT.is256BitVector() && !Subtarget.hasInt256()) | |||
| 30227 | return splitVectorIntBinary(Op, DAG); | |||
| 30228 | ||||
| 30229 | if (VT == MVT::v32i16 || VT == MVT::v64i8) | |||
| 30230 | return splitVectorIntBinary(Op, DAG); | |||
| 30231 | ||||
| 30232 | // Default to expand. | |||
| 30233 | return SDValue(); | |||
| 30234 | } | |||
| 30235 | ||||
| 30236 | static SDValue LowerMINMAX(SDValue Op, const X86Subtarget &Subtarget, | |||
| 30237 | SelectionDAG &DAG) { | |||
| 30238 | MVT VT = Op.getSimpleValueType(); | |||
| 30239 | ||||
| 30240 | // For AVX1 cases, split to use legal ops. | |||
| 30241 | if (VT.is256BitVector() && !Subtarget.hasInt256()) | |||
| 30242 | return splitVectorIntBinary(Op, DAG); | |||
| 30243 | ||||
| 30244 | if (VT == MVT::v32i16 || VT == MVT::v64i8) | |||
| 30245 | return splitVectorIntBinary(Op, DAG); | |||
| 30246 | ||||
| 30247 | // Default to expand. | |||
| 30248 | return SDValue(); | |||
| 30249 | } | |||
| 30250 | ||||
| 30251 | static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget, | |||
| 30252 | SelectionDAG &DAG) { | |||
| 30253 | assert((Op.getOpcode() == ISD::FMAXIMUM || Op.getOpcode() == ISD::FMINIMUM) && | |||
| 30254 | "Expected FMAXIMUM or FMINIMUM opcode"); | |||
| 30255 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 30256 | EVT VT = Op.getValueType(); | |||
| 30257 | SDValue X = Op.getOperand(0); | |||
| 30258 | SDValue Y = Op.getOperand(1); | |||
| 30259 | SDLoc DL(Op); | |||
| 30260 | uint64_t SizeInBits = VT.getFixedSizeInBits(); | |||
| 30261 | APInt PreferredZero = APInt::getZero(SizeInBits); | |||
| 30262 | EVT IVT = MVT::getIntegerVT(SizeInBits); | |||
| 30263 | X86ISD::NodeType MinMaxOp; | |||
| 30264 | if (Op.getOpcode() == ISD::FMAXIMUM) { | |||
| 30265 | MinMaxOp = X86ISD::FMAX; | |||
| 30266 | } else { | |||
| 30267 | PreferredZero.setSignBit(); | |||
| 30268 | MinMaxOp = X86ISD::FMIN; | |||
| 30269 | } | |||
| 30270 | EVT SetCCType = | |||
| 30271 | TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); | |||
| 30272 | ||||
| 30273 | // The tables below show the expected result of Max in cases of NaN and | |||
| 30274 | // signed zeros. | |||
| 30275 | // | |||
| 30276 | // Y Y | |||
| 30277 | // Num xNaN +0 -0 | |||
| 30278 | // --------------- --------------- | |||
| 30279 | // Num | Max | Y | +0 | +0 | +0 | | |||
| 30280 | // X --------------- X --------------- | |||
| 30281 | // xNaN | X | X/Y | -0 | +0 | -0 | | |||
| 30282 | // --------------- --------------- | |||
| 30283 | // | |||
| 30284 | // It is achieved by means of FMAX/FMIN with preliminary checks and operand | |||
| 30285 | // reordering. | |||
| 30286 | // | |||
| 30287 | // We check if any of operands is NaN and return NaN. Then we check if any of | |||
| 30288 | // operands is zero or negative zero (for fmaximum and fminimum respectively) | |||
| 30289 | // to ensure the correct zero is returned. | |||
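| | // The reordering matters because X86ISD::FMAX/FMIN (MAXPS/MINPS semantics) | |||
| | // simply return the second operand when the inputs are unordered or both | |||
| | // are zeros; e.g. fmaximum(-0.0, +0.0) must be +0.0 and fmaximum(NaN, 1.0) | |||
| | // must be NaN, neither of which the raw instruction guarantees. | |||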
| 30290 | auto IsPreferredZero = [PreferredZero](SDValue Op) { | |||
| 30291 | Op = peekThroughBitcasts(Op); | |||
| 30292 | if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op)) | |||
| 30293 | return CstOp->getValueAPF().bitcastToAPInt() == PreferredZero; | |||
| 30294 | if (auto *CstOp = dyn_cast<ConstantSDNode>(Op)) | |||
| 30295 | return CstOp->getAPIntValue() == PreferredZero; | |||
| 30296 | return false; | |||
| 30297 | }; | |||
| 30298 | ||||
| 30299 | bool IsXNeverNaN = DAG.isKnownNeverNaN(X); | |||
| 30300 | bool IsYNeverNaN = DAG.isKnownNeverNaN(Y); | |||
| 30301 | bool IgnoreSignedZero = DAG.getTarget().Options.NoSignedZerosFPMath || | |||
| 30302 | Op->getFlags().hasNoSignedZeros() || | |||
| 30303 | DAG.isKnownNeverZeroFloat(X) || | |||
| 30304 | DAG.isKnownNeverZeroFloat(Y); | |||
| 30305 | SDValue NewX, NewY; | |||
| 30306 | if (IgnoreSignedZero || IsPreferredZero(Y)) { | |||
| 30307 | // Operands are already in the right order, or the order does not matter. | |||
| 30308 | NewX = X; | |||
| 30309 | NewY = Y; | |||
| 30310 | } else if (IsPreferredZero(X)) { | |||
| 30311 | NewX = Y; | |||
| 30312 | NewY = X; | |||
| 30313 | } else if ((VT == MVT::f16 || Subtarget.hasDQI()) && | |||
| 30314 | (Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) { | |||
| 30315 | if (IsXNeverNaN) | |||
| 30316 | std::swap(X, Y); | |||
| 30317 | // VFPCLASSS consumes a vector type, so provide a minimal one corresponding | |||
| 30318 | // to an xmm register. | |||
| 30319 | MVT VectorType = MVT::getVectorVT(VT.getSimpleVT(), 128 / SizeInBits); | |||
| 30320 | SDValue VX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorType, X); | |||
| 30321 | // Bits of classes: | |||
| 30322 | // Bits Imm8[0] Imm8[1] Imm8[2] Imm8[3] Imm8[4] Imm8[5] Imm8[6] Imm8[7] | |||
| 30323 | // Class QNAN PosZero NegZero PosINF NegINF Denormal Negative SNAN | |||
| 30324 | SDValue Imm = DAG.getTargetConstant(MinMaxOp == X86ISD::FMAX ? 0b11 : 0b101, | |||
| 30325 | DL, MVT::i32); | |||
| 30326 | SDValue IsNanZero = DAG.getNode(X86ISD::VFPCLASSS, DL, MVT::v1i1, VX, Imm); | |||
| 30327 | SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1, | |||
| 30328 | DAG.getConstant(0, DL, MVT::v8i1), IsNanZero, | |||
| 30329 | DAG.getIntPtrConstant(0, DL)); | |||
| 30330 | SDValue NeedSwap = DAG.getBitcast(MVT::i8, Ins); | |||
| 30331 | NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X); | |||
| 30332 | NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y); | |||
| 30333 | return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags()); | |||
| 30334 | } else { | |||
| 30335 | SDValue IsXSigned; | |||
| 30336 | if (Subtarget.is64Bit() || VT != MVT::f64) { | |||
| 30337 | SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X); | |||
| 30338 | SDValue ZeroCst = DAG.getConstant(0, DL, IVT); | |||
| 30339 | IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT); | |||
| 30340 | } else { | |||
| 30341 | assert(VT == MVT::f64); | |||
| 30342 | SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64, | |||
| 30343 | DAG.getConstantFP(0, DL, MVT::v2f64), X, | |||
| 30344 | DAG.getIntPtrConstant(0, DL)); | |||
| 30345 | SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins); | |||
| 30346 | SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX, | |||
| 30347 | DAG.getIntPtrConstant(1, DL)); | |||
| 30348 | Hi = DAG.getBitcast(MVT::i32, Hi); | |||
| 30349 | SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32); | |||
| 30350 | EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), | |||
| 30351 | *DAG.getContext(), MVT::i32); | |||
| 30352 | IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT); | |||
| 30353 | } | |||
| 30354 | if (MinMaxOp == X86ISD::FMAX) { | |||
| 30355 | NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y); | |||
| 30356 | NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X); | |||
| 30357 | } else { | |||
| 30358 | NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X); | |||
| 30359 | NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y); | |||
| 30360 | } | |||
| 30361 | } | |||
| 30362 | ||||
| 30363 | bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath || | |||
| 30364 | Op->getFlags().hasNoNaNs() || (IsXNeverNaN && IsYNeverNaN); | |||
| 30365 | ||||
| 30366 | // If we did not reorder the operands for signed-zero handling but we need to | |||
| 30367 | // handle NaN and know that the second operand is not NaN, then put it in the | |||
| 30368 | // first operand so no NaN post-processing is needed after the max/min. | |||
| 30369 | if (IgnoreSignedZero && !IgnoreNaN && DAG.isKnownNeverNaN(NewY)) | |||
| 30370 | std::swap(NewX, NewY); | |||
| 30371 | ||||
| 30372 | SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags()); | |||
| 30373 | ||||
| 30374 | if (IgnoreNaN || DAG.isKnownNeverNaN(NewX)) | |||
| 30375 | return MinMax; | |||
| 30376 | ||||
| 30377 | SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NewX, NewX, ISD::SETUO); | |||
| 30378 | return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax); | |||
| 30379 | } | |||
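| | ||||
| | // Illustrative sketch (not part of the original source): a scalar model of | |||
| | // the fmaximum semantics that the lowering above emulates via FMAX with | |||
| | // operand reordering and a NaN post-check. The helper name is hypothetical, | |||
| | // and std::signbit assumes <cmath> is available. | |||
| | static double fmaximumScalarModel(double X, double Y) { | |||
| | if (X != X) return X; // A NaN operand propagates to the result. | |||
| | if (Y != Y) return Y; | |||
| | if (X == 0.0 && Y == 0.0) // +0 beats -0, matching the table above. | |||
| | return (std::signbit(X) && std::signbit(Y)) ? -0.0 : +0.0; | |||
| | return X > Y ? X : Y; | |||
| | } | |||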
| 30380 | ||||
| 30381 | static SDValue LowerABD(SDValue Op, const X86Subtarget &Subtarget, | |||
| 30382 | SelectionDAG &DAG) { | |||
| 30383 | MVT VT = Op.getSimpleValueType(); | |||
| 30384 | ||||
| 30385 | // For AVX1 cases, split to use legal ops. | |||
| 30386 | if (VT.is256BitVector() && !Subtarget.hasInt256()) | |||
| 30387 | return splitVectorIntBinary(Op, DAG); | |||
| 30388 | ||||
| 30389 | if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.useBWIRegs()) | |||
| 30390 | return splitVectorIntBinary(Op, DAG); | |||
| 30391 | ||||
| 30392 | SDLoc dl(Op); | |||
| 30393 | bool IsSigned = Op.getOpcode() == ISD::ABDS; | |||
| 30394 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 30395 | ||||
| 30396 | // TODO: Move to TargetLowering expandABD() once we have ABD promotion. | |||
| 30397 | if (VT.isScalarInteger()) { | |||
| 30398 | unsigned WideBits = std::max<unsigned>(2 * VT.getScalarSizeInBits(), 32u); | |||
| 30399 | MVT WideVT = MVT::getIntegerVT(WideBits); | |||
| 30400 | if (TLI.isTypeLegal(WideVT)) { | |||
| 30401 | // abds(lhs, rhs) -> trunc(abs(sub(sext(lhs), sext(rhs)))) | |||
| 30402 | // abdu(lhs, rhs) -> trunc(abs(sub(zext(lhs), zext(rhs)))) | |||
| 30403 | unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; | |||
| 30404 | SDValue LHS = DAG.getFreeze(Op.getOperand(0)); | |||
| 30405 | SDValue RHS = DAG.getFreeze(Op.getOperand(1)); | |||
| 30406 | LHS = DAG.getNode(ExtOpc, dl, WideVT, LHS); | |||
| 30407 | RHS = DAG.getNode(ExtOpc, dl, WideVT, RHS); | |||
| 30408 | SDValue Diff = DAG.getNode(ISD::SUB, dl, WideVT, LHS, RHS); | |||
| 30409 | SDValue AbsDiff = DAG.getNode(ISD::ABS, dl, WideVT, Diff); | |||
| 30410 | return DAG.getNode(ISD::TRUNCATE, dl, VT, AbsDiff); | |||
| 30411 | } | |||
| 30412 | } | |||
| 30413 | ||||
| 30414 | // Default to expand. | |||
| 30415 | return SDValue(); | |||
| 30416 | } | |||
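| | ||||
| | // Illustrative sketch (not part of the original source): the widening | |||
| | // identity used above for scalar ABDS, modelled for i32 via i64. The helper | |||
| | // name is hypothetical. | |||
| | static int32_t abdsScalarModel(int32_t LHS, int32_t RHS) { | |||
| | int64_t Diff = (int64_t)LHS - (int64_t)RHS; // sext + sub cannot overflow | |||
| | int64_t AbsDiff = Diff < 0 ? -Diff : Diff; // abs in the wide type | |||
| | return (int32_t)AbsDiff; // trunc back to i32 | |||
| | } | |||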
| 30417 | ||||
| 30418 | static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget, | |||
| 30419 | SelectionDAG &DAG) { | |||
| 30420 | SDLoc dl(Op); | |||
| 30421 | MVT VT = Op.getSimpleValueType(); | |||
| 30422 | ||||
| 30423 | // Decompose 256-bit ops into 128-bit ops. | |||
| 30424 | if (VT.is256BitVector() && !Subtarget.hasInt256()) | |||
| 30425 | return splitVectorIntBinary(Op, DAG); | |||
| 30426 | ||||
| 30427 | if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) | |||
| 30428 | return splitVectorIntBinary(Op, DAG); | |||
| 30429 | ||||
| 30430 | SDValue A = Op.getOperand(0); | |||
| 30431 | SDValue B = Op.getOperand(1); | |||
| 30432 | ||||
| 30433 | // Lower v16i8/v32i8/v64i8 mul as extension to v8i16/v16i16/v32i16 | |||
| 30434 | // vector pairs, multiply and truncate. | |||
| 30435 | if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) { | |||
| 30436 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 30437 | ||||
| 30438 | if ((VT == MVT::v16i8 && Subtarget.hasInt256()) || | |||
| 30439 | (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) { | |||
| 30440 | MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); | |||
| 30441 | return DAG.getNode( | |||
| 30442 | ISD::TRUNCATE, dl, VT, | |||
| 30443 | DAG.getNode(ISD::MUL, dl, ExVT, | |||
| 30444 | DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, A), | |||
| 30445 | DAG.getNode(ISD::ANY_EXTEND, dl, ExVT, B))); | |||
| 30446 | } | |||
| 30447 | ||||
| 30448 | MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2); | |||
| 30449 | ||||
| 30450 | // Extract the lo/hi parts and any-extend them to i16. | |||
| 30451 | // We're going to mask off the low byte of each result element of the | |||
| 30452 | // pmullw, so it doesn't matter what's in the high byte of each 16-bit | |||
| 30453 | // element. | |||
| 30454 | SDValue Undef = DAG.getUNDEF(VT); | |||
| 30455 | SDValue ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Undef)); | |||
| 30456 | SDValue AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Undef)); | |||
| 30457 | ||||
| 30458 | SDValue BLo, BHi; | |||
| 30459 | if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) { | |||
| 30460 | // If the RHS is a constant, manually unpackl/unpackh. | |||
| 30461 | SmallVector<SDValue, 16> LoOps, HiOps; | |||
| 30462 | for (unsigned i = 0; i != NumElts; i += 16) { | |||
| 30463 | for (unsigned j = 0; j != 8; ++j) { | |||
| 30464 | LoOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j), dl, | |||
| 30465 | MVT::i16)); | |||
| 30466 | HiOps.push_back(DAG.getAnyExtOrTrunc(B.getOperand(i + j + 8), dl, | |||
| 30467 | MVT::i16)); | |||
| 30468 | } | |||
| 30469 | } | |||
| 30470 | ||||
| 30471 | BLo = DAG.getBuildVector(ExVT, dl, LoOps); | |||
| 30472 | BHi = DAG.getBuildVector(ExVT, dl, HiOps); | |||
| 30473 | } else { | |||
| 30474 | BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Undef)); | |||
| 30475 | BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Undef)); | |||
| 30476 | } | |||
| 30477 | ||||
| 30478 | // Multiply, mask the lower 8 bits of the lo/hi results and pack. | |||
| 30479 | SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo); | |||
| 30480 | SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi); | |||
| 30481 | return getPack(DAG, Subtarget, dl, VT, RLo, RHi); | |||
| 30482 | } | |||
| 30483 | ||||
| 30484 | // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle. | |||
| 30485 | if (VT == MVT::v4i32) { | |||
| 30486 | assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() && | |||
| 30487 | "Should not custom lower when pmulld is available!"); | |||
| 30488 | ||||
| 30489 | // Extract the odd parts. | |||
| 30490 | static const int UnpackMask[] = { 1, -1, 3, -1 }; | |||
| 30491 | SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask); | |||
| 30492 | SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask); | |||
| 30493 | ||||
| 30494 | // Multiply the even parts. | |||
| 30495 | SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, | |||
| 30496 | DAG.getBitcast(MVT::v2i64, A), | |||
| 30497 | DAG.getBitcast(MVT::v2i64, B)); | |||
| 30498 | // Now multiply odd parts. | |||
| 30499 | SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, | |||
| 30500 | DAG.getBitcast(MVT::v2i64, Aodds), | |||
| 30501 | DAG.getBitcast(MVT::v2i64, Bodds)); | |||
| 30502 | ||||
| 30503 | Evens = DAG.getBitcast(VT, Evens); | |||
| 30504 | Odds = DAG.getBitcast(VT, Odds); | |||
| 30505 | ||||
| 30506 | // Merge the two vectors back together with a shuffle. This expands into 2 | |||
| 30507 | // shuffles. | |||
| 30508 | static const int ShufMask[] = { 0, 4, 2, 6 }; | |||
| 30509 | return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask); | |||
| 30510 | } | |||
| 30511 | ||||
| 30512 | assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) && | |||
| 30513 | "Only know how to lower V2I64/V4I64/V8I64 multiply"); | |||
| 30514 | assert(!Subtarget.hasDQI() && "DQI should use MULLQ"); | |||
| 30515 | ||||
| 30516 | // Ahi = psrlqi(a, 32); | |||
| 30517 | // Bhi = psrlqi(b, 32); | |||
| 30518 | // | |||
| 30519 | // AloBlo = pmuludq(a, b); | |||
| 30520 | // AloBhi = pmuludq(a, Bhi); | |||
| 30521 | // AhiBlo = pmuludq(Ahi, b); | |||
| 30522 | // | |||
| 30523 | // Hi = psllqi(AloBhi + AhiBlo, 32); | |||
| 30524 | // return AloBlo + Hi; | |||
| 30525 | KnownBits AKnown = DAG.computeKnownBits(A); | |||
| 30526 | KnownBits BKnown = DAG.computeKnownBits(B); | |||
| 30527 | ||||
| 30528 | APInt LowerBitsMask = APInt::getLowBitsSet(64, 32); | |||
| 30529 | bool ALoIsZero = LowerBitsMask.isSubsetOf(AKnown.Zero); | |||
| 30530 | bool BLoIsZero = LowerBitsMask.isSubsetOf(BKnown.Zero); | |||
| 30531 | ||||
| 30532 | APInt UpperBitsMask = APInt::getHighBitsSet(64, 32); | |||
| 30533 | bool AHiIsZero = UpperBitsMask.isSubsetOf(AKnown.Zero); | |||
| 30534 | bool BHiIsZero = UpperBitsMask.isSubsetOf(BKnown.Zero); | |||
| 30535 | ||||
| 30536 | SDValue Zero = DAG.getConstant(0, dl, VT); | |||
| 30537 | ||||
| 30538 | // Only multiply lo/hi halves that aren't known to be zero. | |||
| 30539 | SDValue AloBlo = Zero; | |||
| 30540 | if (!ALoIsZero && !BLoIsZero) | |||
| 30541 | AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, B); | |||
| 30542 | ||||
| 30543 | SDValue AloBhi = Zero; | |||
| 30544 | if (!ALoIsZero && !BHiIsZero) { | |||
| 30545 | SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG); | |||
| 30546 | AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, A, Bhi); | |||
| 30547 | } | |||
| 30548 | ||||
| 30549 | SDValue AhiBlo = Zero; | |||
| 30550 | if (!AHiIsZero && !BLoIsZero) { | |||
| 30551 | SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG); | |||
| 30552 | AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, B); | |||
| 30553 | } | |||
| 30554 | ||||
| 30555 | SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo); | |||
| 30556 | Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG); | |||
| 30557 | ||||
| 30558 | return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi); | |||
| 30559 | } | |||
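| | ||||
| | // Illustrative sketch (not part of the original source): the partial-product | |||
| | // decomposition used above for v2i64/v4i64/v8i64 MUL, modelled on one scalar | |||
| | // lane. PMULUDQ yields the full 64-bit product of the low 32 bits of each | |||
| | // lane; the Ahi*Bhi term would land above bit 63 and wraps away. | |||
| | static uint64_t mul64ViaPmuludqModel(uint64_t A, uint64_t B) { | |||
| | uint64_t ALo = A & 0xffffffffu, AHi = A >> 32; | |||
| | uint64_t BLo = B & 0xffffffffu, BHi = B >> 32; | |||
| | uint64_t AloBlo = ALo * BLo; // pmuludq(a, b) | |||
| | uint64_t AloBhi = ALo * BHi; // pmuludq(a, Bhi) | |||
| | uint64_t AhiBlo = AHi * BLo; // pmuludq(Ahi, b) | |||
| | return AloBlo + ((AloBhi + AhiBlo) << 32); // AloBlo + Hi | |||
| | } | |||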
| 30560 | ||||
| 30561 | static SDValue LowervXi8MulWithUNPCK(SDValue A, SDValue B, const SDLoc &dl, | |||
| 30562 | MVT VT, bool IsSigned, | |||
| 30563 | const X86Subtarget &Subtarget, | |||
| 30564 | SelectionDAG &DAG, | |||
| 30565 | SDValue *Low = nullptr) { | |||
| 30566 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 30567 | ||||
| 30568 | // For vXi8 we will unpack the low and high half of each 128 bit lane to widen | |||
| 30569 | // to a vXi16 type. Do the multiplies, shift the results and pack the half | |||
| 30570 | // lane results back together. | |||
| 30571 | ||||
| 30572 | // We'll take different approaches for signed and unsigned. | |||
| 30573 | // For unsigned we'll use punpcklbw/punpckhbw to zero extend the bytes | |||
| 30574 | // and use pmullw to calculate the full 16-bit product. | |||
| 30575 | // For signed we'll use punpcklbw/punpckhbw to extend the bytes to words and | |||
| 30576 | // shift them left into the upper byte of each word. This allows us to use | |||
| 30577 | // pmulhw to calculate the full 16-bit product. This trick means we don't | |||
| 30578 | // need to sign extend the bytes to use pmullw. | |||
| 30579 | ||||
| 30580 | MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2); | |||
| 30581 | SDValue Zero = DAG.getConstant(0, dl, VT); | |||
| 30582 | ||||
| 30583 | SDValue ALo, AHi; | |||
| 30584 | if (IsSigned) { | |||
| 30585 | ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, A)); | |||
| 30586 | AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, A)); | |||
| 30587 | } else { | |||
| 30588 | ALo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, A, Zero)); | |||
| 30589 | AHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, A, Zero)); | |||
| 30590 | } | |||
| 30591 | ||||
| 30592 | SDValue BLo, BHi; | |||
| 30593 | if (ISD::isBuildVectorOfConstantSDNodes(B.getNode())) { | |||
| 30594 | // If the RHS is a constant, manually unpackl/unpackh and extend. | |||
| 30595 | SmallVector<SDValue, 16> LoOps, HiOps; | |||
| 30596 | for (unsigned i = 0; i != NumElts; i += 16) { | |||
| 30597 | for (unsigned j = 0; j != 8; ++j) { | |||
| 30598 | SDValue LoOp = B.getOperand(i + j); | |||
| 30599 | SDValue HiOp = B.getOperand(i + j + 8); | |||
| 30600 | ||||
| 30601 | if (IsSigned) { | |||
| 30602 | LoOp = DAG.getAnyExtOrTrunc(LoOp, dl, MVT::i16); | |||
| 30603 | HiOp = DAG.getAnyExtOrTrunc(HiOp, dl, MVT::i16); | |||
| 30604 | LoOp = DAG.getNode(ISD::SHL, dl, MVT::i16, LoOp, | |||
| 30605 | DAG.getConstant(8, dl, MVT::i16)); | |||
| 30606 | HiOp = DAG.getNode(ISD::SHL, dl, MVT::i16, HiOp, | |||
| 30607 | DAG.getConstant(8, dl, MVT::i16)); | |||
| 30608 | } else { | |||
| 30609 | LoOp = DAG.getZExtOrTrunc(LoOp, dl, MVT::i16); | |||
| 30610 | HiOp = DAG.getZExtOrTrunc(HiOp, dl, MVT::i16); | |||
| 30611 | } | |||
| 30612 | ||||
| 30613 | LoOps.push_back(LoOp); | |||
| 30614 | HiOps.push_back(HiOp); | |||
| 30615 | } | |||
| 30616 | } | |||
| 30617 | ||||
| 30618 | BLo = DAG.getBuildVector(ExVT, dl, LoOps); | |||
| 30619 | BHi = DAG.getBuildVector(ExVT, dl, HiOps); | |||
| 30620 | } else if (IsSigned) { | |||
| 30621 | BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, Zero, B)); | |||
| 30622 | BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, Zero, B)); | |||
| 30623 | } else { | |||
| 30624 | BLo = DAG.getBitcast(ExVT, getUnpackl(DAG, dl, VT, B, Zero)); | |||
| 30625 | BHi = DAG.getBitcast(ExVT, getUnpackh(DAG, dl, VT, B, Zero)); | |||
| 30626 | } | |||
| 30627 | ||||
| 30628 | // Multiply, lshr the upper 8 bits down into the lower 8 bits of the lo/hi | |||
| 30629 | // results and pack back to vXi8. | |||
| 30630 | unsigned MulOpc = IsSigned ? ISD::MULHS : ISD::MUL; | |||
| 30631 | SDValue RLo = DAG.getNode(MulOpc, dl, ExVT, ALo, BLo); | |||
| 30632 | SDValue RHi = DAG.getNode(MulOpc, dl, ExVT, AHi, BHi); | |||
| 30633 | ||||
| 30634 | if (Low) | |||
| 30635 | *Low = getPack(DAG, Subtarget, dl, VT, RLo, RHi); | |||
| 30636 | ||||
| 30637 | return getPack(DAG, Subtarget, dl, VT, RLo, RHi, /*PackHiHalf*/ true); | |||
| 30638 | } | |||
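| | ||||
| | // Illustrative sketch (not part of the original source): why the signed path | |||
| | // above can use PMULHW without sign extending. With each byte placed in the | |||
| | // upper byte of a word, (a << 8) * (b << 8) == (a * b) << 16, so the high 16 | |||
| | // bits of the 32-bit product are exactly the 16-bit product a * b. | |||
| | static int16_t mulI8ViaPmulhwModel(int8_t A, int8_t B) { | |||
| | int16_t WideA = (int16_t)((uint16_t)(uint8_t)A << 8); // byte in high half | |||
| | int16_t WideB = (int16_t)((uint16_t)(uint8_t)B << 8); | |||
| | int32_t Prod = (int32_t)WideA * (int32_t)WideB; // what pmulhw multiplies | |||
| | return (int16_t)(Prod >> 16); // pmulhw keeps the high 16 bits | |||
| | } | |||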
| 30639 | ||||
| 30640 | static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, | |||
| 30641 | SelectionDAG &DAG) { | |||
| 30642 | SDLoc dl(Op); | |||
| 30643 | MVT VT = Op.getSimpleValueType(); | |||
| 30644 | bool IsSigned = Op->getOpcode() == ISD::MULHS; | |||
| 30645 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 30646 | SDValue A = Op.getOperand(0); | |||
| 30647 | SDValue B = Op.getOperand(1); | |||
| 30648 | ||||
| 30649 | // Decompose 256-bit ops into 128-bit ops. | |||
| 30650 | if (VT.is256BitVector() && !Subtarget.hasInt256()) | |||
| 30651 | return splitVectorIntBinary(Op, DAG); | |||
| 30652 | ||||
| 30653 | if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) | |||
| 30654 | return splitVectorIntBinary(Op, DAG); | |||
| 30655 | ||||
| 30656 | if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) { | |||
| 30657 | assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) || | |||
| 30658 | (VT == MVT::v8i32 && Subtarget.hasInt256()) || | |||
| 30659 | (VT == MVT::v16i32 && Subtarget.hasAVX512())); | |||
| 30660 | ||||
| 30661 | // PMULxD operations multiply each even value (starting at 0) of LHS with | |||
| 30662 | // the related value of RHS and produce a widened result. | |||
| 30663 | // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h> | |||
| 30664 | // => <2 x i64> <ae|cg> | |||
| 30665 | // | |||
| 30666 | // In other words, to have all the results, we need to perform two PMULxD: | |||
| 30667 | // 1. one with the even values. | |||
| 30668 | // 2. one with the odd values. | |||
| 30669 | // To achieve #2, we need to place the odd values at an even position. | |||
| 30670 | // | |||
| 30671 | // Place the odd value at an even position (basically, shift all values 1 | |||
| 30672 | // step to the left): | |||
| 30673 | const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, | |||
| 30674 | 9, -1, 11, -1, 13, -1, 15, -1}; | |||
| 30675 | // <a|b|c|d> => <b|undef|d|undef> | |||
| 30676 | SDValue Odd0 = | |||
| 30677 | DAG.getVectorShuffle(VT, dl, A, A, ArrayRef(&Mask[0], NumElts)); | |||
| 30678 | // <e|f|g|h> => <f|undef|h|undef> | |||
| 30679 | SDValue Odd1 = | |||
| 30680 | DAG.getVectorShuffle(VT, dl, B, B, ArrayRef(&Mask[0], NumElts)); | |||
| 30681 | ||||
| 30682 | // Emit two multiplies, one for the lower 2 ints and one for the higher 2 | |||
| 30683 | // ints. | |||
| 30684 | MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2); | |||
| 30685 | unsigned Opcode = | |||
| 30686 | (IsSigned && Subtarget.hasSSE41()) ? X86ISD::PMULDQ : X86ISD::PMULUDQ; | |||
| 30687 | // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h> | |||
| 30688 | // => <2 x i64> <ae|cg> | |||
| 30689 | SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, | |||
| 30690 | DAG.getBitcast(MulVT, A), | |||
| 30691 | DAG.getBitcast(MulVT, B))); | |||
| 30692 | // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef> | |||
| 30693 | // => <2 x i64> <bf|dh> | |||
| 30694 | SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, | |||
| 30695 | DAG.getBitcast(MulVT, Odd0), | |||
| 30696 | DAG.getBitcast(MulVT, Odd1))); | |||
| 30697 | ||||
| 30698 | // Shuffle it back into the right order. | |||
| 30699 | SmallVector<int, 16> ShufMask(NumElts); | |||
| 30700 | for (int i = 0; i != (int)NumElts; ++i) | |||
| 30701 | ShufMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1; | |||
| 30702 | ||||
| 30703 | SDValue Res = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, ShufMask); | |||
| 30704 | ||||
| 30705 | // If we have a signed multiply but no PMULDQ, fix up the result of an | |||
| 30706 | // unsigned multiply. | |||
| 30707 | if (IsSigned && !Subtarget.hasSSE41()) { | |||
| 30708 | SDValue Zero = DAG.getConstant(0, dl, VT); | |||
| 30709 | SDValue T1 = DAG.getNode(ISD::AND, dl, VT, | |||
| 30710 | DAG.getSetCC(dl, VT, Zero, A, ISD::SETGT), B); | |||
| 30711 | SDValue T2 = DAG.getNode(ISD::AND, dl, VT, | |||
| 30712 | DAG.getSetCC(dl, VT, Zero, B, ISD::SETGT), A); | |||
| 30713 | ||||
| 30714 | SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2); | |||
| 30715 | Res = DAG.getNode(ISD::SUB, dl, VT, Res, Fixup); | |||
| 30716 | } | |||
| 30717 | ||||
| 30718 | return Res; | |||
| 30719 | } | |||
| 30720 | ||||
| 30721 | // Only i8 vectors should need custom lowering after this. | |||
| 30722 | assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || | |||
| 30723 | (VT == MVT::v64i8 && Subtarget.hasBWI())) && | |||
| 30724 | "Unsupported vector type"); | |||
| 30725 | ||||
| 30726 | // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply, | |||
| 30727 | // logical shift down the upper half and pack back to i8. | |||
| 30728 | ||||
| 30729 | // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack | |||
| 30730 | // and then ashr/lshr the upper bits down to the lower bits before multiply. | |||
| 30731 | ||||
| 30732 | if ((VT == MVT::v16i8 && Subtarget.hasInt256()) || | |||
| 30733 | (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) { | |||
| 30734 | MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts); | |||
| 30735 | unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; | |||
| 30736 | SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A); | |||
| 30737 | SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B); | |||
| 30738 | SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB); | |||
| 30739 | Mul = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG); | |||
| 30740 | return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); | |||
| 30741 | } | |||
| 30742 | ||||
| 30743 | return LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG); | |||
| 30744 | } | |||
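| | ||||
| | // Illustrative sketch (not part of the original source): the signed fixup | |||
| | // used above when PMULDQ is unavailable. Interpreted signed, an operand is | |||
| | // its unsigned value minus 2^32 when negative, so the signed high half is | |||
| | // the unsigned high half minus the other operand for each negative operand. | |||
| | static int32_t mulhsFromMulhuModel(int32_t A, int32_t B) { | |||
| | uint64_t WideU = (uint64_t)(uint32_t)A * (uint64_t)(uint32_t)B; | |||
| | uint32_t HighU = (uint32_t)(WideU >> 32); // unsigned high half | |||
| | uint32_t T1 = A < 0 ? (uint32_t)B : 0; // and(setgt(0, A), B) | |||
| | uint32_t T2 = B < 0 ? (uint32_t)A : 0; // and(setgt(0, B), A) | |||
| | return (int32_t)(HighU - (T1 + T2)); // sub(Res, Fixup) | |||
| | } | |||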
| 30745 | ||||
| 30746 | // Custom lowering for SMULO/UMULO. | |||
| 30747 | static SDValue LowerMULO(SDValue Op, const X86Subtarget &Subtarget, | |||
| 30748 | SelectionDAG &DAG) { | |||
| 30749 | MVT VT = Op.getSimpleValueType(); | |||
| 30750 | ||||
| 30751 | // Scalars defer to LowerXALUO. | |||
| 30752 | if (!VT.isVector()) | |||
| 30753 | return LowerXALUO(Op, DAG); | |||
| 30754 | ||||
| 30755 | SDLoc dl(Op); | |||
| 30756 | bool IsSigned = Op->getOpcode() == ISD::SMULO; | |||
| 30757 | SDValue A = Op.getOperand(0); | |||
| 30758 | SDValue B = Op.getOperand(1); | |||
| 30759 | EVT OvfVT = Op->getValueType(1); | |||
| 30760 | ||||
| 30761 | if ((VT == MVT::v32i8 && !Subtarget.hasInt256()) || | |||
| 30762 | (VT == MVT::v64i8 && !Subtarget.hasBWI())) { | |||
| 30763 | // Extract the LHS Lo/Hi vectors | |||
| 30764 | SDValue LHSLo, LHSHi; | |||
| 30765 | std::tie(LHSLo, LHSHi) = splitVector(A, DAG, dl); | |||
| 30766 | ||||
| 30767 | // Extract the RHS Lo/Hi vectors | |||
| 30768 | SDValue RHSLo, RHSHi; | |||
| 30769 | std::tie(RHSLo, RHSHi) = splitVector(B, DAG, dl); | |||
| 30770 | ||||
| 30771 | EVT LoOvfVT, HiOvfVT; | |||
| 30772 | std::tie(LoOvfVT, HiOvfVT) = DAG.GetSplitDestVTs(OvfVT); | |||
| 30773 | SDVTList LoVTs = DAG.getVTList(LHSLo.getValueType(), LoOvfVT); | |||
| 30774 | SDVTList HiVTs = DAG.getVTList(LHSHi.getValueType(), HiOvfVT); | |||
| 30775 | ||||
| 30776 | // Issue the split operations. | |||
| 30777 | SDValue Lo = DAG.getNode(Op.getOpcode(), dl, LoVTs, LHSLo, RHSLo); | |||
| 30778 | SDValue Hi = DAG.getNode(Op.getOpcode(), dl, HiVTs, LHSHi, RHSHi); | |||
| 30779 | ||||
| 30780 | // Join the separate data results and the overflow results. | |||
| 30781 | SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); | |||
| 30782 | SDValue Ovf = DAG.getNode(ISD::CONCAT_VECTORS, dl, OvfVT, Lo.getValue(1), | |||
| 30783 | Hi.getValue(1)); | |||
| 30784 | ||||
| 30785 | return DAG.getMergeValues({Res, Ovf}, dl); | |||
| 30786 | } | |||
| 30787 | ||||
| 30788 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 30789 | EVT SetccVT = | |||
| 30790 | TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT); | |||
| 30791 | ||||
| 30792 | if ((VT == MVT::v16i8 && Subtarget.hasInt256()) || | |||
| 30793 | (VT == MVT::v32i8 && Subtarget.canExtendTo512BW())) { | |||
| 30794 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 30795 | MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts); | |||
| 30796 | unsigned ExAVX = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; | |||
| 30797 | SDValue ExA = DAG.getNode(ExAVX, dl, ExVT, A); | |||
| 30798 | SDValue ExB = DAG.getNode(ExAVX, dl, ExVT, B); | |||
| 30799 | SDValue Mul = DAG.getNode(ISD::MUL, dl, ExVT, ExA, ExB); | |||
| 30800 | ||||
| 30801 | SDValue Low = DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); | |||
| 30802 | ||||
| 30803 | SDValue Ovf; | |||
| 30804 | if (IsSigned) { | |||
| 30805 | SDValue High, LowSign; | |||
| 30806 | if (OvfVT.getVectorElementType() == MVT::i1 && | |||
| 30807 | (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) { | |||
| 30808 | // Rather than truncating, try to do the compare on vXi16 or vXi32. | |||
| 30809 | // Shift the high down filling with sign bits. | |||
| 30810 | High = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Mul, 8, DAG); | |||
| 30811 | // Fill all 16 bits with the sign bit from the low. | |||
| 30812 | LowSign = | |||
| 30813 | getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExVT, Mul, 8, DAG); | |||
| 30814 | LowSign = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, LowSign, | |||
| 30815 | 15, DAG); | |||
| 30816 | SetccVT = OvfVT; | |||
| 30817 | if (!Subtarget.hasBWI()) { | |||
| 30818 | // We can't do a vXi16 compare so sign extend to v16i32. | |||
| 30819 | High = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, High); | |||
| 30820 | LowSign = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v16i32, LowSign); | |||
| 30821 | } | |||
| 30822 | } else { | |||
| 30823 | // Otherwise do the compare at vXi8. | |||
| 30824 | High = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG); | |||
| 30825 | High = DAG.getNode(ISD::TRUNCATE, dl, VT, High); | |||
| 30826 | LowSign = | |||
| 30827 | DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT)); | |||
| 30828 | } | |||
| 30829 | ||||
| 30830 | Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE); | |||
| 30831 | } else { | |||
| 30832 | SDValue High = | |||
| 30833 | getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Mul, 8, DAG); | |||
| 30834 | if (OvfVT.getVectorElementType() == MVT::i1 && | |||
| 30835 | (Subtarget.hasBWI() || Subtarget.canExtendTo512DQ())) { | |||
| 30836 | // Rather than truncating, try to do the compare on vXi16 or vXi32. | |||
| 30837 | SetccVT = OvfVT; | |||
| 30838 | if (!Subtarget.hasBWI()) { | |||
| 30839 | // We can't do a vXi16 compare so sign extend to v16i32. | |||
| 30840 | High = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, High); | |||
| 30841 | } | |||
| 30842 | } else { | |||
| 30843 | // Otherwise do the compare at vXi8. | |||
| 30844 | High = DAG.getNode(ISD::TRUNCATE, dl, VT, High); | |||
| 30845 | } | |||
| 30846 | ||||
| 30847 | Ovf = | |||
| 30848 | DAG.getSetCC(dl, SetccVT, High, | |||
| 30849 | DAG.getConstant(0, dl, High.getValueType()), ISD::SETNE); | |||
| 30850 | } | |||
| 30851 | ||||
| 30852 | Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT); | |||
| 30853 | ||||
| 30854 | return DAG.getMergeValues({Low, Ovf}, dl); | |||
| 30855 | } | |||
| 30856 | ||||
| 30857 | SDValue Low; | |||
| 30858 | SDValue High = | |||
| 30859 | LowervXi8MulWithUNPCK(A, B, dl, VT, IsSigned, Subtarget, DAG, &Low); | |||
| 30860 | ||||
| 30861 | SDValue Ovf; | |||
| 30862 | if (IsSigned) { | |||
| 30863 | // SMULO overflows if the high bits don't match the sign of the low. | |||
| 30864 | SDValue LowSign = | |||
| 30865 | DAG.getNode(ISD::SRA, dl, VT, Low, DAG.getConstant(7, dl, VT)); | |||
| 30866 | Ovf = DAG.getSetCC(dl, SetccVT, LowSign, High, ISD::SETNE); | |||
| 30867 | } else { | |||
| 30868 | // UMULO overflows if the high bits are non-zero. | |||
| 30869 | Ovf = | |||
| 30870 | DAG.getSetCC(dl, SetccVT, High, DAG.getConstant(0, dl, VT), ISD::SETNE); | |||
| 30871 | } | |||
| 30872 | ||||
| 30873 | Ovf = DAG.getSExtOrTrunc(Ovf, dl, OvfVT); | |||
| 30874 | ||||
| 30875 | return DAG.getMergeValues({Low, Ovf}, dl); | |||
| 30876 | } | |||
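| | ||||
| | // Illustrative sketch (not part of the original source): the overflow checks | |||
| | // used above, modelled on one scalar i8 lane. SMULO overflows when the high | |||
| | // byte is not the sign extension of the low byte; UMULO overflows when the | |||
| | // high byte is non-zero. | |||
| | static bool smulo8Model(int8_t A, int8_t B, int8_t &Low) { | |||
| | int16_t Wide = (int16_t)A * (int16_t)B; // full 16-bit product | |||
| | Low = (int8_t)Wide; | |||
| | int8_t High = (int8_t)((uint16_t)Wide >> 8); // high byte of the product | |||
| | int8_t LowSign = (int8_t)(Low >> 7); // 0 or -1: replicated sign bit | |||
| | return High != LowSign; // setcc(LowSign, High, SETNE) | |||
| | } | |||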
| 30877 | ||||
| 30878 | SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const { | |||
| 30879 | assert(Subtarget.isTargetWin64() && "Unexpected target"); | |||
| 30880 | EVT VT = Op.getValueType(); | |||
| 30881 | assert(VT.isInteger() && VT.getSizeInBits() == 128 && | |||
| 30882 | "Unexpected return type for lowering"); | |||
| 30883 | ||||
| 30884 | if (isa<ConstantSDNode>(Op->getOperand(1))) { | |||
| 30885 | SmallVector<SDValue> Result; | |||
| 30886 | if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i64, DAG)) | |||
| 30887 | return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), VT, Result[0], Result[1]); | |||
| 30888 | } | |||
| 30889 | ||||
| 30890 | RTLIB::Libcall LC; | |||
| 30891 | bool isSigned; | |||
| 30892 | switch (Op->getOpcode()) { | |||
| 30893 | default: llvm_unreachable("Unexpected request for libcall!"); | |||
| 30894 | case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break; | |||
| 30895 | case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break; | |||
| 30896 | case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break; | |||
| 30897 | case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break; | |||
| 30898 | } | |||
| 30899 | ||||
| 30900 | SDLoc dl(Op); | |||
| 30901 | SDValue InChain = DAG.getEntryNode(); | |||
| 30902 | ||||
| 30903 | TargetLowering::ArgListTy Args; | |||
| 30904 | TargetLowering::ArgListEntry Entry; | |||
| 30905 | for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) { | |||
| 30906 | EVT ArgVT = Op->getOperand(i).getValueType(); | |||
| 30907 | assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && | |||
| 30908 | "Unexpected argument type for lowering"); | |||
| 30909 | SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); | |||
| 30910 | int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); | |||
| 30911 | MachinePointerInfo MPI = | |||
| 30912 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); | |||
| 30913 | Entry.Node = StackPtr; | |||
| 30914 | InChain = | |||
| 30915 | DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MPI, Align(16)); | |||
| 30916 | Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); | |||
| 30917 | Entry.Ty = PointerType::get(ArgTy, 0); | |||
| 30918 | Entry.IsSExt = false; | |||
| 30919 | Entry.IsZExt = false; | |||
| 30920 | Args.push_back(Entry); | |||
| 30921 | } | |||
| 30922 | ||||
| 30923 | SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), | |||
| 30924 | getPointerTy(DAG.getDataLayout())); | |||
| 30925 | ||||
| 30926 | TargetLowering::CallLoweringInfo CLI(DAG); | |||
| 30927 | CLI.setDebugLoc(dl) | |||
| 30928 | .setChain(InChain) | |||
| 30929 | .setLibCallee( | |||
| 30930 | getLibcallCallingConv(LC), | |||
| 30931 | static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee, | |||
| 30932 | std::move(Args)) | |||
| 30933 | .setInRegister() | |||
| 30934 | .setSExtResult(isSigned) | |||
| 30935 | .setZExtResult(!isSigned); | |||
| 30936 | ||||
| 30937 | std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI); | |||
| 30938 | return DAG.getBitcast(VT, CallInfo.first); | |||
| 30939 | } | |||
| 30940 | ||||
| 30941 | SDValue X86TargetLowering::LowerWin64_FP_TO_INT128(SDValue Op, | |||
| 30942 | SelectionDAG &DAG, | |||
| 30943 | SDValue &Chain) const { | |||
| 30944 | assert(Subtarget.isTargetWin64() && "Unexpected target"); | |||
| 30945 | EVT VT = Op.getValueType(); | |||
| 30946 | bool IsStrict = Op->isStrictFPOpcode(); | |||
| 30947 | ||||
| 30948 | SDValue Arg = Op.getOperand(IsStrict ? 1 : 0); | |||
| 30949 | EVT ArgVT = Arg.getValueType(); | |||
| 30950 | ||||
| 30951 | assert(VT.isInteger() && VT.getSizeInBits() == 128 && | |||
| 30952 | "Unexpected return type for lowering"); | |||
| 30953 | ||||
| 30954 | RTLIB::Libcall LC; | |||
| 30955 | if (Op->getOpcode() == ISD::FP_TO_SINT || | |||
| 30956 | Op->getOpcode() == ISD::STRICT_FP_TO_SINT) | |||
| 30957 | LC = RTLIB::getFPTOSINT(ArgVT, VT); | |||
| 30958 | else | |||
| 30959 | LC = RTLIB::getFPTOUINT(ArgVT, VT); | |||
| 30960 | assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!"); | |||
| 30961 | ||||
| 30962 | SDLoc dl(Op); | |||
| 30963 | MakeLibCallOptions CallOptions; | |||
| 30964 | Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); | |||
| 30965 | ||||
| 30966 | SDValue Result; | |||
| 30967 | // Expect the i128 result to be returned as a v2i64 in xmm0; cast back to the | |||
| 30968 | // expected VT (i128). | |||
| 30969 | std::tie(Result, Chain) = | |||
| 30970 | makeLibCall(DAG, LC, MVT::v2i64, Arg, CallOptions, dl, Chain); | |||
| 30971 | Result = DAG.getBitcast(VT, Result); | |||
| 30972 | return Result; | |||
| 30973 | } | |||
| 30974 | ||||
| 30975 | SDValue X86TargetLowering::LowerWin64_INT128_TO_FP(SDValue Op, | |||
| 30976 | SelectionDAG &DAG) const { | |||
| 30977 | assert(Subtarget.isTargetWin64() && "Unexpected target"); | |||
| 30978 | EVT VT = Op.getValueType(); | |||
| 30979 | bool IsStrict = Op->isStrictFPOpcode(); | |||
| 30980 | ||||
| 30981 | SDValue Arg = Op.getOperand(IsStrict ? 1 : 0); | |||
| 30982 | EVT ArgVT = Arg.getValueType(); | |||
| 30983 | ||||
| 30984 | assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 && | |||
| 30985 | "Unexpected argument type for lowering"); | |||
| 30986 | ||||
| 30987 | RTLIB::Libcall LC; | |||
| 30988 | if (Op->getOpcode() == ISD::SINT_TO_FP || | |||
| 30989 | Op->getOpcode() == ISD::STRICT_SINT_TO_FP) | |||
| 30990 | LC = RTLIB::getSINTTOFP(ArgVT, VT); | |||
| 30991 | else | |||
| 30992 | LC = RTLIB::getUINTTOFP(ArgVT, VT); | |||
| 30993 | assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected request for libcall!"); | |||
| 30994 | ||||
| 30995 | SDLoc dl(Op); | |||
| 30996 | MakeLibCallOptions CallOptions; | |||
| 30997 | SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode(); | |||
| 30998 | ||||
| 30999 | // Pass the i128 argument as an indirect argument on the stack. | |||
| 31000 | SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16); | |||
| 31001 | int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); | |||
| 31002 | MachinePointerInfo MPI = | |||
| 31003 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); | |||
| 31004 | Chain = DAG.getStore(Chain, dl, Arg, StackPtr, MPI, Align(16)); | |||
| 31005 | ||||
| 31006 | SDValue Result; | |||
| 31007 | std::tie(Result, Chain) = | |||
| 31008 | makeLibCall(DAG, LC, VT, StackPtr, CallOptions, dl, Chain); | |||
| 31009 | return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result; | |||
| 31010 | } | |||
| 31011 | ||||
| 31012 | // Return true if the required (according to Opcode) shift-imm form is natively | |||
| 31013 | // supported by the Subtarget. | |||
| 31014 | static bool supportedVectorShiftWithImm(EVT VT, const X86Subtarget &Subtarget, | |||
| 31015 | unsigned Opcode) { | |||
| 31016 | if (!VT.isSimple()) | |||
| 31017 | return false; | |||
| 31018 | ||||
| 31019 | if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) | |||
| 31020 | return false; | |||
| 31021 | ||||
| 31022 | if (VT.getScalarSizeInBits() < 16) | |||
| 31023 | return false; | |||
| 31024 | ||||
| 31025 | if (VT.is512BitVector() && Subtarget.useAVX512Regs() && | |||
| 31026 | (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI())) | |||
| 31027 | return true; | |||
| 31028 | ||||
| 31029 | bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) || | |||
| 31030 | (VT.is256BitVector() && Subtarget.hasInt256()); | |||
| 31031 | ||||
| 31032 | bool AShift = LShift && (Subtarget.hasAVX512() || | |||
| 31033 | (VT != MVT::v2i64 && VT != MVT::v4i64)); | |||
| 31034 | return (Opcode == ISD::SRA) ? AShift : LShift; | |||
| 31035 | } | |||
| 31036 | ||||
| 31037 | // The shift amount is a variable, but it is the same for all vector lanes. | |||
| 31038 | // These instructions are defined together with shift-immediate. | |||
| 31039 | static | |||
| 31040 | bool supportedVectorShiftWithBaseAmnt(EVT VT, const X86Subtarget &Subtarget, | |||
| 31041 | unsigned Opcode) { | |||
| 31042 | return supportedVectorShiftWithImm(VT, Subtarget, Opcode); | |||
| 31043 | } | |||
| 31044 | ||||
| 31045 | // Return true if the required (according to Opcode) variable-shift form is | |||
| 31046 | // natively supported by the Subtarget. | |||
| 31047 | static bool supportedVectorVarShift(EVT VT, const X86Subtarget &Subtarget, | |||
| 31048 | unsigned Opcode) { | |||
| 31049 | if (!VT.isSimple()) | |||
| 31050 | return false; | |||
| 31051 | ||||
| 31052 | if (!(VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector())) | |||
| 31053 | return false; | |||
| 31054 | ||||
| 31055 | if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16) | |||
| 31056 | return false; | |||
| 31057 | ||||
| 31058 | // vXi16 supported only on AVX-512, BWI | |||
| 31059 | if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI()) | |||
| 31060 | return false; | |||
| 31061 | ||||
| 31062 | if (Subtarget.hasAVX512() && | |||
| 31063 | (Subtarget.useAVX512Regs() || !VT.is512BitVector())) | |||
| 31064 | return true; | |||
| 31065 | ||||
| 31066 | bool LShift = VT.is128BitVector() || VT.is256BitVector(); | |||
| 31067 | bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64; | |||
| 31068 | return (Opcode == ISD::SRA) ? AShift : LShift; | |||
| 31069 | } | |||
| 31070 | ||||
| 31071 | static SDValue LowerShiftByScalarImmediate(SDValue Op, SelectionDAG &DAG, | |||
| 31072 | const X86Subtarget &Subtarget) { | |||
| 31073 | MVT VT = Op.getSimpleValueType(); | |||
| 31074 | SDLoc dl(Op); | |||
| 31075 | SDValue R = Op.getOperand(0); | |||
| 31076 | SDValue Amt = Op.getOperand(1); | |||
| 31077 | unsigned X86Opc = getTargetVShiftUniformOpcode(Op.getOpcode(), false); | |||
| 31078 | ||||
| 31079 | auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) { | |||
| 31080 | assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type"); | |||
| 31081 | MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); | |||
| 31082 | SDValue Ex = DAG.getBitcast(ExVT, R); | |||
| 31083 | ||||
| 31084 | // ashr(R, 63) === cmp_slt(R, 0) | |||
| 31085 | if (ShiftAmt == 63 && Subtarget.hasSSE42()) { | |||
| 31086 | assert((VT != MVT::v4i64 || Subtarget.hasInt256()) && | |||
| 31087 | "Unsupported PCMPGT op"); | |||
| 31088 | return DAG.getNode(X86ISD::PCMPGT, dl, VT, DAG.getConstant(0, dl, VT), R); | |||
| 31089 | } | |||
| 31090 | ||||
| 31091 | if (ShiftAmt >= 32) { | |||
| 31092 | // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32. | |||
| 31093 | SDValue Upper = | |||
| 31094 | getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG); | |||
| 31095 | SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, | |||
| 31096 | ShiftAmt - 32, DAG); | |||
| 31097 | if (VT == MVT::v2i64) | |||
| 31098 | Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3}); | |||
| 31099 | if (VT == MVT::v4i64) | |||
| 31100 | Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, | |||
| 31101 | {9, 1, 11, 3, 13, 5, 15, 7}); | |||
| 31102 | } else { | |||
| 31103 | // SRA upper i32, SRL whole i64 and select lower i32. | |||
| 31104 | SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, | |||
| 31105 | ShiftAmt, DAG); | |||
| 31106 | SDValue Lower = | |||
| 31107 | getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG); | |||
| 31108 | Lower = DAG.getBitcast(ExVT, Lower); | |||
| 31109 | if (VT == MVT::v2i64) | |||
| 31110 | Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3}); | |||
| 31111 | if (VT == MVT::v4i64) | |||
| 31112 | Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, | |||
| 31113 | {8, 1, 10, 3, 12, 5, 14, 7}); | |||
| 31114 | } | |||
| 31115 | return DAG.getBitcast(VT, Ex); | |||
| 31116 | }; | |||
| 31117 | ||||
| 31118 | // Optimize shl/srl/sra with constant shift amount. | |||
| 31119 | APInt APIntShiftAmt; | |||
| 31120 | if (!X86::isConstantSplat(Amt, APIntShiftAmt)) | |||
| 31121 | return SDValue(); | |||
| 31122 | ||||
| 31123 | // If the shift amount is out of range, return undef. | |||
| 31124 | if (APIntShiftAmt.uge(VT.getScalarSizeInBits())) | |||
| 31125 | return DAG.getUNDEF(VT); | |||
| 31126 | ||||
| 31127 | uint64_t ShiftAmt = APIntShiftAmt.getZExtValue(); | |||
| 31128 | ||||
| 31129 | if (supportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode())) { | |||
| 31130 | // Hardware support for vector shifts is sparse, which makes us scalarize the | |||
| 31131 | // vector operations in many cases. Also, on Sandy Bridge ADD is faster than | |||
| 31132 | // SHL: (shl V, 1) -> (add (freeze V), (freeze V)) | |||
| 31133 | if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) { | |||
| 31134 | // R may be undef at run-time, but (shl R, 1) must be an even number (LSB | |||
| 31135 | // must be 0). (add undef, undef) however can be any value. To make this | |||
| 31136 | // safe, we must freeze R to ensure that register allocation uses the same | |||
| 31137 | // register for an undefined value. This ensures that the result will | |||
| 31138 | // still be even and preserves the original semantics. | |||
| 31139 | R = DAG.getFreeze(R); | |||
| 31140 | return DAG.getNode(ISD::ADD, dl, VT, R, R); | |||
| 31141 | } | |||
| 31142 | ||||
| 31143 | return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG); | |||
| 31144 | } | |||
| 31145 | ||||
| 31146 | // i64 SRA needs to be performed as partial shifts. | |||
| 31147 | if (((!Subtarget.hasXOP() && VT == MVT::v2i64) || | |||
| 31148 | (Subtarget.hasInt256() && VT == MVT::v4i64)) && | |||
| 31149 | Op.getOpcode() == ISD::SRA) | |||
| 31150 | return ArithmeticShiftRight64(ShiftAmt); | |||
| 31151 | ||||
| 31152 | if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) || | |||
| 31153 | (Subtarget.hasBWI() && VT == MVT::v64i8)) { | |||
| 31154 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 31155 | MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); | |||
| 31156 | ||||
| 31157 | // Simple i8 add case | |||
| 31158 | if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1) { | |||
| 31159 | // R may be undef at run-time, but (shl R, 1) must be an even number (LSB | |||
| 31160 | // must be 0). (add undef, undef) however can be any value. To make this | |||
| 31161 | // safe, we must freeze R to ensure that register allocation uses the same | |||
| 31162 | // register for an undefined value. This ensures that the result will | |||
| 31163 | // still be even and preserves the original semantics. | |||
| 31164 | R = DAG.getFreeze(R); | |||
| 31165 | return DAG.getNode(ISD::ADD, dl, VT, R, R); | |||
| 31166 | } | |||
| 31167 | ||||
| 31168 | // ashr(R, 7) === cmp_slt(R, 0) | |||
| 31169 | if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) { | |||
| 31170 | SDValue Zeros = DAG.getConstant(0, dl, VT); | |||
| 31171 | if (VT.is512BitVector()) { | |||
| 31172 | assert(VT == MVT::v64i8 && "Unexpected element type!"); | |||
| 31173 | SDValue CMP = DAG.getSetCC(dl, MVT::v64i1, Zeros, R, ISD::SETGT); | |||
| 31174 | return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP); | |||
| 31175 | } | |||
| 31176 | return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R); | |||
| 31177 | } | |||
| 31178 | ||||
| 31179 | // XOP can shift v16i8 directly instead of as shift v8i16 + mask. | |||
| 31180 | if (VT == MVT::v16i8 && Subtarget.hasXOP()) | |||
| 31181 | return SDValue(); | |||
| 31182 | ||||
| 31183 | if (Op.getOpcode() == ISD::SHL) { | |||
| 31184 | // Make a large shift. | |||
| 31185 | SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT, R, | |||
| 31186 | ShiftAmt, DAG); | |||
| 31187 | SHL = DAG.getBitcast(VT, SHL); | |||
| 31188 | // Zero out the rightmost bits. | |||
| 31189 | APInt Mask = APInt::getHighBitsSet(8, 8 - ShiftAmt); | |||
| 31190 | return DAG.getNode(ISD::AND, dl, VT, SHL, DAG.getConstant(Mask, dl, VT)); | |||
| 31191 | } | |||
| 31192 | if (Op.getOpcode() == ISD::SRL) { | |||
| 31193 | // Make a large shift. | |||
| 31194 | SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT, R, | |||
| 31195 | ShiftAmt, DAG); | |||
| 31196 | SRL = DAG.getBitcast(VT, SRL); | |||
| 31197 | // Zero out the leftmost bits. | |||
| 31198 | APInt Mask = APInt::getLowBitsSet(8, 8 - ShiftAmt); | |||
| 31199 | return DAG.getNode(ISD::AND, dl, VT, SRL, DAG.getConstant(Mask, dl, VT)); | |||
| 31200 | } | |||
| 31201 | if (Op.getOpcode() == ISD::SRA) { | |||
| 31202 | // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask) | |||
| 31203 | SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt); | |||
| 31204 | ||||
| 31205 | SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT); | |||
| 31206 | Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask); | |||
| 31207 | Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask); | |||
| 31208 | return Res; | |||
| 31209 | } | |||
| 31210 | llvm_unreachable("Unknown shift opcode."); | |||
| 31211 | } | |||
| 31212 | ||||
| 31213 | return SDValue(); | |||
| 31214 | } | |||
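| | ||||
| | // Illustrative sketch (not part of the original source): the vXi8 arithmetic | |||
| | // shift trick used above, modelled on a scalar byte. After the logical shift | |||
| | // right, xor/sub with Mask = 128 >> ShiftAmt re-extends the sign bit: | |||
| | // ashr(R, Amt) == sub(xor(lshr(R, Amt), Mask), Mask). | |||
| | static int8_t ashrI8ViaLshrModel(int8_t R, unsigned Amt) { | |||
| | uint8_t Res = (uint8_t)((uint8_t)R >> Amt); // lshr(R, Amt) | |||
| | uint8_t Mask = (uint8_t)(0x80u >> Amt); // shifted sign-bit position | |||
| | Res ^= Mask; // flip the shifted sign bit... | |||
| | return (int8_t)(Res - Mask); // ...and subtract to sign extend | |||
| | } | |||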
| 31215 | ||||
| 31216 | static SDValue LowerShiftByScalarVariable(SDValue Op, SelectionDAG &DAG, | |||
| 31217 | const X86Subtarget &Subtarget) { | |||
| 31218 | MVT VT = Op.getSimpleValueType(); | |||
| 31219 | SDLoc dl(Op); | |||
| 31220 | SDValue R = Op.getOperand(0); | |||
| 31221 | SDValue Amt = Op.getOperand(1); | |||
| 31222 | unsigned Opcode = Op.getOpcode(); | |||
| 31223 | unsigned X86OpcI = getTargetVShiftUniformOpcode(Opcode, false); | |||
| 31224 | ||||
| 31225 | int BaseShAmtIdx = -1; | |||
| 31226 | if (SDValue BaseShAmt = DAG.getSplatSourceVector(Amt, BaseShAmtIdx)) { | |||
| 31227 | if (supportedVectorShiftWithBaseAmnt(VT, Subtarget, Opcode)) | |||
| 31228 | return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, BaseShAmtIdx, | |||
| 31229 | Subtarget, DAG); | |||
| 31230 | ||||
| 31231 | // vXi8 shifts - shift as v8i16 + mask result. | |||
| 31232 | if (((VT == MVT::v16i8 && !Subtarget.canExtendTo512DQ()) || | |||
| 31233 | (VT == MVT::v32i8 && !Subtarget.canExtendTo512BW()) || | |||
| 31234 | VT == MVT::v64i8) && | |||
| 31235 | !Subtarget.hasXOP()) { | |||
| 31236 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 31237 | MVT ExtVT = MVT::getVectorVT(MVT::i16, NumElts / 2); | |||
| 31238 | if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, Opcode)) { | |||
| 31239 | unsigned LogicalOp = (Opcode == ISD::SHL ? ISD::SHL : ISD::SRL); | |||
| 31240 | unsigned LogicalX86Op = getTargetVShiftUniformOpcode(LogicalOp, false); | |||
| 31241 | ||||
| 31242 | // Create the mask using vXi16 shifts. For shift-rights we need to move | |||
| 31243 | // the upper byte down before splatting the vXi8 mask. | |||
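| | // Illustrative example (lshr by 3): -1 as i16 gives 0xFFFF >> 3 = 0x1FFF; the | |||
| | // upper byte 0x1F is the correct per-byte mask, so the extra srl by 8 moves | |||
| | // it into the low byte before it is splatted across every byte lane. | |||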
| 31244 | SDValue BitMask = DAG.getConstant(-1, dl, ExtVT); | |||
| 31245 | BitMask = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, BitMask, | |||
| 31246 | BaseShAmt, BaseShAmtIdx, Subtarget, DAG); | |||
| 31247 | if (Opcode != ISD::SHL) | |||
| 31248 | BitMask = getTargetVShiftByConstNode(LogicalX86Op, dl, ExtVT, BitMask, | |||
| 31249 | 8, DAG); | |||
| 31250 | BitMask = DAG.getBitcast(VT, BitMask); | |||
| 31251 | BitMask = DAG.getVectorShuffle(VT, dl, BitMask, BitMask, | |||
| 31252 | SmallVector<int, 64>(NumElts, 0)); | |||
| 31253 | ||||
| 31254 | SDValue Res = getTargetVShiftNode(LogicalX86Op, dl, ExtVT, | |||
| 31255 | DAG.getBitcast(ExtVT, R), BaseShAmt, | |||
| 31256 | BaseShAmtIdx, Subtarget, DAG); | |||
| 31257 | Res = DAG.getBitcast(VT, Res); | |||
| 31258 | Res = DAG.getNode(ISD::AND, dl, VT, Res, BitMask); | |||
| 31259 | ||||
| 31260 | if (Opcode == ISD::SRA) { | |||
| 31261 | // ashr(R, Amt) === sub(xor(lshr(R, Amt), SignMask), SignMask) | |||
| 31262 | // SignMask = lshr(SignBit, Amt) - safe to do this with PSRLW. | |||
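| | // Illustrative example (Amt = 3): 0x8080 >> 3 = 0x1010, i.e. each byte holds | |||
| | // 0x10 = 0x80 >> 3; for Amt in [0,7] the i16 shift never crosses byte lanes. | |||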
| 31263 | SDValue SignMask = DAG.getConstant(0x8080, dl, ExtVT); | |||
| 31264 | SignMask = | |||
| 31265 | getTargetVShiftNode(LogicalX86Op, dl, ExtVT, SignMask, BaseShAmt, | |||
| 31266 | BaseShAmtIdx, Subtarget, DAG); | |||
| 31267 | SignMask = DAG.getBitcast(VT, SignMask); | |||
| 31268 | Res = DAG.getNode(ISD::XOR, dl, VT, Res, SignMask); | |||
| 31269 | Res = DAG.getNode(ISD::SUB, dl, VT, Res, SignMask); | |||
| 31270 | } | |||
| 31271 | return Res; | |||
| 31272 | } | |||
| 31273 | } | |||
| 31274 | } | |||
| 31275 | ||||
| 31276 | return SDValue(); | |||
| 31277 | } | |||
| 31278 | ||||
| 31279 | // Convert a shift/rotate left amount to a multiplication scale factor. | |||
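| | // That is, shl(x, y) === mul(x, 1 << y) per element, so a variable shift | |||
| | // becomes a single vector multiply once the per-element scales are built. | |||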
| 31280 | static SDValue convertShiftLeftToScale(SDValue Amt, const SDLoc &dl, | |||
| 31281 | const X86Subtarget &Subtarget, | |||
| 31282 | SelectionDAG &DAG) { | |||
| 31283 | MVT VT = Amt.getSimpleValueType(); | |||
| 31284 | if (!(VT == MVT::v8i16 || VT == MVT::v4i32 || | |||
| 31285 | (Subtarget.hasInt256() && VT == MVT::v16i16) || | |||
| 31286 | (Subtarget.hasAVX512() && VT == MVT::v32i16) || | |||
| 31287 | (!Subtarget.hasAVX512() && VT == MVT::v16i8) || | |||
| 31288 | (Subtarget.hasInt256() && VT == MVT::v32i8) || | |||
| 31289 | (Subtarget.hasBWI() && VT == MVT::v64i8))) | |||
| 31290 | return SDValue(); | |||
| 31291 | ||||
| 31292 | MVT SVT = VT.getVectorElementType(); | |||
| 31293 | unsigned SVTBits = SVT.getSizeInBits(); | |||
| 31294 | unsigned NumElems = VT.getVectorNumElements(); | |||
| 31295 | ||||
| 31296 | APInt UndefElts; | |||
| 31297 | SmallVector<APInt> EltBits; | |||
| 31298 | if (getTargetConstantBitsFromNode(Amt, SVTBits, UndefElts, EltBits)) { | |||
| 31299 | APInt One(SVTBits, 1); | |||
| 31300 | SmallVector<SDValue> Elts(NumElems, DAG.getUNDEF(SVT)); | |||
| 31301 | for (unsigned I = 0; I != NumElems; ++I) { | |||
| 31302 | if (UndefElts[I] || EltBits[I].uge(SVTBits)) | |||
| 31303 | continue; | |||
| 31304 | uint64_t ShAmt = EltBits[I].getZExtValue(); | |||
| 31305 | Elts[I] = DAG.getConstant(One.shl(ShAmt), dl, SVT); | |||
| 31306 | } | |||
| 31307 | return DAG.getBuildVector(VT, dl, Elts); | |||
| 31308 | } | |||
| 31309 | ||||
| 31310 | // If the target doesn't support variable shifts, use either FP conversion | |||
| 31311 | // or integer multiplication to avoid shifting each element individually. | |||
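| | // Illustrative example of the v4i32 float trick below: placing Amt in the | |||
| | // exponent field, (5 << 23) + 0x3f800000 is the IEEE-754 encoding of | |||
| | // 1.0f * 2^5 = 32.0f, so FP_TO_SINT recovers the scale factor 1 << 5 = 32. | |||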
| 31312 | if (VT == MVT::v4i32) { | |||
| 31313 | Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT)); | |||
| 31314 | Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, | |||
| 31315 | DAG.getConstant(0x3f800000U, dl, VT)); | |||
| 31316 | Amt = DAG.getBitcast(MVT::v4f32, Amt); | |||
| 31317 | return DAG.getNode(ISD::FP_TO_SINT, dl, VT, Amt); | |||
| 31318 | } | |||
| 31319 | ||||
| 31320 | // AVX2 can more effectively perform this as a zext/trunc to/from v8i32. | |||
| 31321 | if (VT == MVT::v8i16 && !Subtarget.hasAVX2()) { | |||
| 31322 | SDValue Z = DAG.getConstant(0, dl, VT); | |||
| 31323 | SDValue Lo = DAG.getBitcast(MVT::v4i32, getUnpackl(DAG, dl, VT, Amt, Z)); | |||
| 31324 | SDValue Hi = DAG.getBitcast(MVT::v4i32, getUnpackh(DAG, dl, VT, Amt, Z)); | |||
| 31325 | Lo = convertShiftLeftToScale(Lo, dl, Subtarget, DAG); | |||
| 31326 | Hi = convertShiftLeftToScale(Hi, dl, Subtarget, DAG); | |||
| 31327 | if (Subtarget.hasSSE41()) | |||
| 31328 | return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); | |||
| 31329 | return getPack(DAG, Subtarget, dl, VT, Lo, Hi); | |||
| 31330 | } | |||
| 31331 | ||||
| 31332 | return SDValue(); | |||
| 31333 | } | |||
| 31334 | ||||
| 31335 | static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, | |||
| 31336 | SelectionDAG &DAG) { | |||
| 31337 | MVT VT = Op.getSimpleValueType(); | |||
| 31338 | SDLoc dl(Op); | |||
| 31339 | SDValue R = Op.getOperand(0); | |||
| 31340 | SDValue Amt = Op.getOperand(1); | |||
| 31341 | unsigned EltSizeInBits = VT.getScalarSizeInBits(); | |||
| 31342 | bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); | |||
| 31343 | ||||
| 31344 | unsigned Opc = Op.getOpcode(); | |||
| 31345 | unsigned X86OpcV = getTargetVShiftUniformOpcode(Opc, true); | |||
| 31346 | unsigned X86OpcI = getTargetVShiftUniformOpcode(Opc, false); | |||
| 31347 | ||||
| 31348 | assert(VT.isVector() && "Custom lowering only for vector shifts!"); | |||
| 31349 | assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!"); | |||
| 31350 | ||||
| 31351 | if (SDValue V = LowerShiftByScalarImmediate(Op, DAG, Subtarget)) | |||
| 31352 | return V; | |||
| 31353 | ||||
| 31354 | if (SDValue V = LowerShiftByScalarVariable(Op, DAG, Subtarget)) | |||
| 31355 | return V; | |||
| 31356 | ||||
| 31357 | if (supportedVectorVarShift(VT, Subtarget, Opc)) | |||
| 31358 | return Op; | |||
| 31359 | ||||
| 31360 | // i64 vector arithmetic shift can be emulated with the transform: | |||
| 31361 | // M = lshr(SIGN_MASK, Amt) | |||
| 31362 | // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M) | |||
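| | // This is the usual sign-extension identity: lshr clears the upper bits, M | |||
| | // has a single 1 at the shifted sign-bit position, and (x ^ M) - M re-fills | |||
| | // the upper bits with copies of the sign (see the i8 worked example above). | |||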
| 31363 | if (((VT == MVT::v2i64 && !Subtarget.hasXOP()) || | |||
| 31364 | (VT == MVT::v4i64 && Subtarget.hasInt256())) && | |||
| 31365 | Opc == ISD::SRA) { | |||
| 31366 | SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT); | |||
| 31367 | SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt); | |||
| 31368 | R = DAG.getNode(ISD::SRL, dl, VT, R, Amt); | |||
| 31369 | R = DAG.getNode(ISD::XOR, dl, VT, R, M); | |||
| 31370 | R = DAG.getNode(ISD::SUB, dl, VT, R, M); | |||
| 31371 | return R; | |||
| 31372 | } | |||
| 31373 | ||||
| 31374 | // XOP has 128-bit variable logical/arithmetic shifts. | |||
| 31375 | // +ve/-ve Amt = shift left/right. | |||
| 31376 | if (Subtarget.hasXOP() && (VT == MVT::v2i64 || VT == MVT::v4i32 || | |||
| 31377 | VT == MVT::v8i16 || VT == MVT::v16i8)) { | |||
| 31378 | if (Opc == ISD::SRL || Opc == ISD::SRA) { | |||
| 31379 | SDValue Zero = DAG.getConstant(0, dl, VT); | |||
| 31380 | Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt); | |||
| 31381 | } | |||
| 31382 | if (Opc == ISD::SHL || Opc == ISD::SRL) | |||
| 31383 | return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt); | |||
| 31384 | if (Opc == ISD::SRA) | |||
| 31385 | return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt); | |||
| 31386 | } | |||
| 31387 | ||||
| 31388 | // v2i64 vector logical shifts can efficiently avoid scalarization - do the | |||
| 31389 | // shifts per-lane and then shuffle the partial results back together. | |||
| 31390 | if (VT == MVT::v2i64 && Opc != ISD::SRA) { | |||
| 31391 | // Splat the shift amounts so the scalar shift paths above will catch them. | |||
| 31392 | SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0}); | |||
| 31393 | SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1}); | |||
| 31394 | SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0); | |||
| 31395 | SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1); | |||
| 31396 | return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3}); | |||
| 31397 | } | |||
| 31398 | ||||
| 31399 | // If possible, lower this shift as a sequence of two shifts by | |||
| 31400 | // constant plus a BLENDing shuffle instead of scalarizing it. | |||
| 31401 | // Example: | |||
| 31402 | // (v4i32 (srl A, (build_vector < X, Y, Y, Y>))) | |||
| 31403 | // | |||
| 31404 | // Could be rewritten as: | |||
| 31405 | // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>))) | |||
| 31406 | // | |||
| 31407 | // The advantage is that the two shifts from the example would be | |||
| 31408 | // lowered as X86ISD::VSRLI nodes in parallel before blending. | |||
| 31409 | if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32 || | |||
| 31410 | (VT == MVT::v16i16 && Subtarget.hasInt256()))) { | |||
| 31411 | SDValue Amt1, Amt2; | |||
| 31412 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 31413 | SmallVector<int, 8> ShuffleMask; | |||
| 31414 | for (unsigned i = 0; i != NumElts; ++i) { | |||
| 31415 | SDValue A = Amt->getOperand(i); | |||
| 31416 | if (A.isUndef()) { | |||
| 31417 | ShuffleMask.push_back(SM_SentinelUndef); | |||
| 31418 | continue; | |||
| 31419 | } | |||
| 31420 | if (!Amt1 || Amt1 == A) { | |||
| 31421 | ShuffleMask.push_back(i); | |||
| 31422 | Amt1 = A; | |||
| 31423 | continue; | |||
| 31424 | } | |||
| 31425 | if (!Amt2 || Amt2 == A) { | |||
| 31426 | ShuffleMask.push_back(i + NumElts); | |||
| 31427 | Amt2 = A; | |||
| 31428 | continue; | |||
| 31429 | } | |||
| 31430 | break; | |||
| 31431 | } | |||
| 31432 | ||||
| 31433 | // Only use this blend if it can be done without loading a mask. | |||
| 31434 | if (ShuffleMask.size() == NumElts && Amt1 && Amt2 && | |||
| 31435 | (VT != MVT::v16i16 || | |||
| 31436 | is128BitLaneRepeatedShuffleMask(VT, ShuffleMask)) && | |||
| 31437 | (VT == MVT::v4i32 || Subtarget.hasSSE41() || Opc != ISD::SHL || | |||
| 31438 | canWidenShuffleElements(ShuffleMask))) { | |||
| 31439 | auto *Cst1 = dyn_cast<ConstantSDNode>(Amt1); | |||
| 31440 | auto *Cst2 = dyn_cast<ConstantSDNode>(Amt2); | |||
| 31441 | if (Cst1 && Cst2 && Cst1->getAPIntValue().ult(EltSizeInBits) && | |||
| 31442 | Cst2->getAPIntValue().ult(EltSizeInBits)) { | |||
| 31443 | SDValue Shift1 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, | |||
| 31444 | Cst1->getZExtValue(), DAG); | |||
| 31445 | SDValue Shift2 = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, | |||
| 31446 | Cst2->getZExtValue(), DAG); | |||
| 31447 | return DAG.getVectorShuffle(VT, dl, Shift1, Shift2, ShuffleMask); | |||
| 31448 | } | |||
| 31449 | } | |||
| 31450 | } | |||
| 31451 | ||||
| 31452 | // If possible, lower this packed shift into a vector multiply instead of | |||
| 31453 | // expanding it into a sequence of scalar shifts. | |||
| 31454 | // For v32i8 cases, it might be quicker to split/extend to vXi16 shifts. | |||
| 31455 | if (Opc == ISD::SHL && !(VT == MVT::v32i8 && (Subtarget.hasXOP() || | |||
| 31456 | Subtarget.canExtendTo512BW()))) | |||
| 31457 | if (SDValue Scale = convertShiftLeftToScale(Amt, dl, Subtarget, DAG)) | |||
| 31458 | return DAG.getNode(ISD::MUL, dl, VT, R, Scale); | |||
| 31459 | ||||
| 31460 | // Constant ISD::SRL can be performed efficiently on vXi16 vectors as we | |||
| 31461 | // can replace with ISD::MULHU, creating scale factor from (NumEltBits - Amt). | |||
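| | // Illustrative example (i16, Amt = 3): MULHU(x, 1 << 13) = (x * 2^13) >> 16 | |||
| | // = x >> 3. Amt = 0 would need a scale of 2^16, which doesn't fit in i16, | |||
| | // hence the SETEQ/select below that passes R through unchanged. | |||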
| 31462 | if (Opc == ISD::SRL && ConstantAmt && | |||
| 31463 | (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256()))) { | |||
| 31464 | SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT); | |||
| 31465 | SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt); | |||
| 31466 | if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) { | |||
| 31467 | SDValue Zero = DAG.getConstant(0, dl, VT); | |||
| 31468 | SDValue ZAmt = DAG.getSetCC(dl, VT, Amt, Zero, ISD::SETEQ); | |||
| 31469 | SDValue Res = DAG.getNode(ISD::MULHU, dl, VT, R, Scale); | |||
| 31470 | return DAG.getSelect(dl, VT, ZAmt, R, Res); | |||
| 31471 | } | |||
| 31472 | } | |||
| 31473 | ||||
| 31474 | // Constant ISD::SRA can be performed efficiently on vXi16 vectors as we | |||
| 31475 | // can replace with ISD::MULHS, creating scale factor from (NumEltBits - Amt). | |||
| 31476 | // TODO: Special case handling for shift by 0/1, really we can afford either | |||
| 31477 | // of these cases in pre-SSE41/XOP/AVX512 but not both. | |||
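| | // Illustrative example (i16, Amt = 3): MULHS(x, 1 << 13) = (x * 2^13) >> 16 | |||
| | // rounds toward -infinity, matching x >> 3 for negative x too. Amt = 1 needs | |||
| | // the separate VSRAI below since 1 << 15 would be read as a negative scale. | |||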
| 31478 | if (Opc == ISD::SRA && ConstantAmt && | |||
| 31479 | (VT == MVT::v8i16 || (VT == MVT::v16i16 && Subtarget.hasInt256())) && | |||
| 31480 | ((Subtarget.hasSSE41() && !Subtarget.hasXOP() && | |||
| 31481 | !Subtarget.hasAVX512()) || | |||
| 31482 | DAG.isKnownNeverZero(Amt))) { | |||
| 31483 | SDValue EltBits = DAG.getConstant(EltSizeInBits, dl, VT); | |||
| 31484 | SDValue RAmt = DAG.getNode(ISD::SUB, dl, VT, EltBits, Amt); | |||
| 31485 | if (SDValue Scale = convertShiftLeftToScale(RAmt, dl, Subtarget, DAG)) { | |||
| 31486 | SDValue Amt0 = | |||
| 31487 | DAG.getSetCC(dl, VT, Amt, DAG.getConstant(0, dl, VT), ISD::SETEQ); | |||
| 31488 | SDValue Amt1 = | |||
| 31489 | DAG.getSetCC(dl, VT, Amt, DAG.getConstant(1, dl, VT), ISD::SETEQ); | |||
| 31490 | SDValue Sra1 = | |||
| 31491 | getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, R, 1, DAG); | |||
| 31492 | SDValue Res = DAG.getNode(ISD::MULHS, dl, VT, R, Scale); | |||
| 31493 | Res = DAG.getSelect(dl, VT, Amt0, R, Res); | |||
| 31494 | return DAG.getSelect(dl, VT, Amt1, Sra1, Res); | |||
| 31495 | } | |||
| 31496 | } | |||
| 31497 | ||||
| 31498 | // v4i32 Non Uniform Shifts. | |||
| 31499 | // If the shift amount is constant we can shift each lane using the SSE2 | |||
| 31500 | // immediate shifts, else we need to zero-extend each lane to the lower i64 | |||
| 31501 | // and shift using the SSE2 variable shifts. | |||
| 31502 | // The separate results can then be blended together. | |||
| 31503 | if (VT == MVT::v4i32) { | |||
| 31504 | SDValue Amt0, Amt1, Amt2, Amt3; | |||
| 31505 | if (ConstantAmt) { | |||
| 31506 | Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0}); | |||
| 31507 | Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1}); | |||
| 31508 | Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2}); | |||
| 31509 | Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3}); | |||
| 31510 | } else { | |||
| 31511 | // The SSE2 shifts use the lower i64 as the same shift amount for | |||
| 31512 | // all lanes and the upper i64 is ignored. On AVX we're better off | |||
| 31513 | // just zero-extending, but for SSE just duplicating the top 16-bits is | |||
| 31514 | // cheaper and has the same effect for out of range values. | |||
| 31515 | if (Subtarget.hasAVX()) { | |||
| 31516 | SDValue Z = DAG.getConstant(0, dl, VT); | |||
| 31517 | Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1}); | |||
| 31518 | Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1}); | |||
| 31519 | Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1}); | |||
| 31520 | Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1}); | |||
| 31521 | } else { | |||
| 31522 | SDValue Amt01 = DAG.getBitcast(MVT::v8i16, Amt); | |||
| 31523 | SDValue Amt23 = DAG.getVectorShuffle(MVT::v8i16, dl, Amt01, Amt01, | |||
| 31524 | {4, 5, 6, 7, -1, -1, -1, -1}); | |||
| 31525 | SDValue Msk02 = getV4X86ShuffleImm8ForMask({0, 1, 1, 1}, dl, DAG); | |||
| 31526 | SDValue Msk13 = getV4X86ShuffleImm8ForMask({2, 3, 3, 3}, dl, DAG); | |||
| 31527 | Amt0 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk02); | |||
| 31528 | Amt1 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt01, Msk13); | |||
| 31529 | Amt2 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk02); | |||
| 31530 | Amt3 = DAG.getNode(X86ISD::PSHUFLW, dl, MVT::v8i16, Amt23, Msk13); | |||
| 31531 | } | |||
| 31532 | } | |||
| 31533 | ||||
| 31534 | unsigned ShOpc = ConstantAmt ? Opc : X86OpcV; | |||
| 31535 | SDValue R0 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt0)); | |||
| 31536 | SDValue R1 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt1)); | |||
| 31537 | SDValue R2 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt2)); | |||
| 31538 | SDValue R3 = DAG.getNode(ShOpc, dl, VT, R, DAG.getBitcast(VT, Amt3)); | |||
| 31539 | ||||
| 31540 | // Merge the shifted lane results optimally with/without PBLENDW. | |||
| 31541 | // TODO - ideally shuffle combining would handle this. | |||
| 31542 | if (Subtarget.hasSSE41()) { | |||
| 31543 | SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1}); | |||
| 31544 | SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7}); | |||
| 31545 | return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7}); | |||
| 31546 | } | |||
| 31547 | SDValue R01 = DAG.getVectorShuffle(VT, dl, R0, R1, {0, -1, -1, 5}); | |||
| 31548 | SDValue R23 = DAG.getVectorShuffle(VT, dl, R2, R3, {2, -1, -1, 7}); | |||
| 31549 | return DAG.getVectorShuffle(VT, dl, R01, R23, {0, 3, 4, 7}); | |||
| 31550 | } | |||
| 31551 | ||||
| 31552 | // It's worth extending once and using the vXi16/vXi32 shifts for smaller | |||
| 31553 | // types, but without AVX512 the extra overheads to get from vXi8 to vXi32 | |||
| 31554 | // make the existing SSE solution better. | |||
| 31555 | // NOTE: We honor the preferred vector width before promoting to 512-bits. | |||
| 31556 | if ((Subtarget.hasInt256() && VT == MVT::v8i16) || | |||
| 31557 | (Subtarget.canExtendTo512DQ() && VT == MVT::v16i16) || | |||
| 31558 | (Subtarget.canExtendTo512DQ() && VT == MVT::v16i8) || | |||
| 31559 | (Subtarget.canExtendTo512BW() && VT == MVT::v32i8) || | |||
| 31560 | (Subtarget.hasBWI() && Subtarget.hasVLX() && VT == MVT::v16i8)) { | |||
| 31561 | assert((!Subtarget.hasBWI() || VT == MVT::v32i8 || VT == MVT::v16i8) && | |||
| 31562 | "Unexpected vector type"); | |||
| 31563 | MVT EvtSVT = Subtarget.hasBWI() ? MVT::i16 : MVT::i32; | |||
| 31564 | MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements()); | |||
| 31565 | unsigned ExtOpc = Opc == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; | |||
| 31566 | R = DAG.getNode(ExtOpc, dl, ExtVT, R); | |||
| 31567 | Amt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Amt); | |||
| 31568 | return DAG.getNode(ISD::TRUNCATE, dl, VT, | |||
| 31569 | DAG.getNode(Opc, dl, ExtVT, R, Amt)); | |||
| 31570 | } | |||
| 31571 | ||||
| 31572 | // Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we | |||
| 31573 | // extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI. | |||
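| | // Illustrative example (u8, Amt = 3): zext to i16, multiply by | |||
| | // 1 << (8 - 3) = 32, and the high byte (x * 32) >> 8 equals x >> 3; the | |||
| | // signed variant sign-extends instead. Amt = 0 still works since the scale | |||
| | // 1 << 8 = 256 fits in the i16 amount vector. | |||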
| 31574 | if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) && | |||
| 31575 | (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || | |||
| 31576 | (VT == MVT::v64i8 && Subtarget.hasBWI())) && | |||
| 31577 | !Subtarget.hasXOP()) { | |||
| 31578 | int NumElts = VT.getVectorNumElements(); | |||
| 31579 | SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8); | |||
| 31580 | ||||
| 31581 | // Extend constant shift amount to vXi16 (it doesn't matter if the type | |||
| 31582 | // isn't legal). | |||
| 31583 | MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts); | |||
| 31584 | Amt = DAG.getZExtOrTrunc(Amt, dl, ExVT); | |||
| 31585 | Amt = DAG.getNode(ISD::SUB, dl, ExVT, DAG.getConstant(8, dl, ExVT), Amt); | |||
| 31586 | Amt = DAG.getNode(ISD::SHL, dl, ExVT, DAG.getConstant(1, dl, ExVT), Amt); | |||
| 31587 | assert(ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()) && | |||
| 31588 | "Constant build vector expected"); | |||
| 31589 | ||||
| 31590 | if (VT == MVT::v16i8 && Subtarget.hasInt256()) { | |||
| 31591 | R = Opc == ISD::SRA ? DAG.getSExtOrTrunc(R, dl, ExVT) | |||
| 31592 | : DAG.getZExtOrTrunc(R, dl, ExVT); | |||
| 31593 | R = DAG.getNode(ISD::MUL, dl, ExVT, R, Amt); | |||
| 31594 | R = DAG.getNode(X86ISD::VSRLI, dl, ExVT, R, Cst8); | |||
| 31595 | return DAG.getZExtOrTrunc(R, dl, VT); | |||
| 31596 | } | |||
| 31597 | ||||
| 31598 | SmallVector<SDValue, 16> LoAmt, HiAmt; | |||
| 31599 | for (int i = 0; i != NumElts; i += 16) { | |||
| 31600 | for (int j = 0; j != 8; ++j) { | |||
| 31601 | LoAmt.push_back(Amt.getOperand(i + j)); | |||
| 31602 | HiAmt.push_back(Amt.getOperand(i + j + 8)); | |||
| 31603 | } | |||
| 31604 | } | |||
| 31605 | ||||
| 31606 | MVT VT16 = MVT::getVectorVT(MVT::i16, NumElts / 2); | |||
| 31607 | SDValue LoA = DAG.getBuildVector(VT16, dl, LoAmt); | |||
| 31608 | SDValue HiA = DAG.getBuildVector(VT16, dl, HiAmt); | |||
| 31609 | ||||
| 31610 | SDValue LoR = DAG.getBitcast(VT16, getUnpackl(DAG, dl, VT, R, R)); | |||
| 31611 | SDValue HiR = DAG.getBitcast(VT16, getUnpackh(DAG, dl, VT, R, R)); | |||
| 31612 | LoR = DAG.getNode(X86OpcI, dl, VT16, LoR, Cst8); | |||
| 31613 | HiR = DAG.getNode(X86OpcI, dl, VT16, HiR, Cst8); | |||
| 31614 | LoR = DAG.getNode(ISD::MUL, dl, VT16, LoR, LoA); | |||
| 31615 | HiR = DAG.getNode(ISD::MUL, dl, VT16, HiR, HiA); | |||
| 31616 | LoR = DAG.getNode(X86ISD::VSRLI, dl, VT16, LoR, Cst8); | |||
| 31617 | HiR = DAG.getNode(X86ISD::VSRLI, dl, VT16, HiR, Cst8); | |||
| 31618 | return DAG.getNode(X86ISD::PACKUS, dl, VT, LoR, HiR); | |||
| 31619 | } | |||
| 31620 | ||||
| 31621 | if (VT == MVT::v16i8 || | |||
| 31622 | (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) || | |||
| 31623 | (VT == MVT::v64i8 && Subtarget.hasBWI())) { | |||
| 31624 | MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2); | |||
| 31625 | ||||
| 31626 | auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { | |||
| 31627 | if (VT.is512BitVector()) { | |||
| 31628 | // On AVX512BW targets we make use of the fact that VSELECT lowers | |||
| 31629 | // to a masked blend which selects bytes based just on the sign bit | |||
| 31630 | // extracted to a mask. | |||
| 31631 | MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements()); | |||
| 31632 | V0 = DAG.getBitcast(VT, V0); | |||
| 31633 | V1 = DAG.getBitcast(VT, V1); | |||
| 31634 | Sel = DAG.getBitcast(VT, Sel); | |||
| 31635 | Sel = DAG.getSetCC(dl, MaskVT, DAG.getConstant(0, dl, VT), Sel, | |||
| 31636 | ISD::SETGT); | |||
| 31637 | return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); | |||
| 31638 | } else if (Subtarget.hasSSE41()) { | |||
| 31639 | // On SSE41 targets we can use PBLENDVB which selects bytes based just | |||
| 31640 | // on the sign bit. | |||
| 31641 | V0 = DAG.getBitcast(VT, V0); | |||
| 31642 | V1 = DAG.getBitcast(VT, V1); | |||
| 31643 | Sel = DAG.getBitcast(VT, Sel); | |||
| 31644 | return DAG.getBitcast(SelVT, | |||
| 31645 | DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1)); | |||
| 31646 | } | |||
| 31647 | // On pre-SSE41 targets we test for the sign bit by comparing to | |||
| 31648 | // zero - a negative value will set all bits of the lanes to true | |||
| 31649 | // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. | |||
| 31650 | SDValue Z = DAG.getConstant(0, dl, SelVT); | |||
| 31651 | SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel); | |||
| 31652 | return DAG.getSelect(dl, SelVT, C, V0, V1); | |||
| 31653 | }; | |||
| 31654 | ||||
| 31655 | // Turn 'a' into a mask suitable for VSELECT: a = a << 5; | |||
| 31656 | // We can safely do this using i16 shifts as we're only interested in | |||
| 31657 | // the 3 lower bits of each byte. | |||
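| | // After a << 5, bit 2 of the amount sits in each byte's sign bit, so the | |||
| | // select/add ladder below tests one amount bit per step (shift by 4, then 2, | |||
| | // then 1), doubling 'a' to expose the next bit. | |||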
| 31658 | Amt = DAG.getBitcast(ExtVT, Amt); | |||
| 31659 | Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ExtVT, Amt, 5, DAG); | |||
| 31660 | Amt = DAG.getBitcast(VT, Amt); | |||
| 31661 | ||||
| 31662 | if (Opc == ISD::SHL || Opc == ISD::SRL) { | |||
| 31663 | // r = VSELECT(r, shift(r, 4), a); | |||
| 31664 | SDValue M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(4, dl, VT)); | |||
| 31665 | R = SignBitSelect(VT, Amt, M, R); | |||
| 31666 | ||||
| 31667 | // a += a | |||
| 31668 | Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); | |||
| 31669 | ||||
| 31670 | // r = VSELECT(r, shift(r, 2), a); | |||
| 31671 | M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(2, dl, VT)); | |||
| 31672 | R = SignBitSelect(VT, Amt, M, R); | |||
| 31673 | ||||
| 31674 | // a += a | |||
| 31675 | Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); | |||
| 31676 | ||||
| 31677 | // return VSELECT(r, shift(r, 1), a); | |||
| 31678 | M = DAG.getNode(Opc, dl, VT, R, DAG.getConstant(1, dl, VT)); | |||
| 31679 | R = SignBitSelect(VT, Amt, M, R); | |||
| 31680 | return R; | |||
| 31681 | } | |||
| 31682 | ||||
| 31683 | if (Opc == ISD::SRA) { | |||
| 31684 | // For SRA we need to unpack each byte to the higher byte of a i16 vector | |||
| 31685 | // so we can correctly sign extend. We don't care what happens to the | |||
| 31686 | // lower byte. | |||
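| | // i.e. each byte b is placed at (b << 8) | undef, so the arithmetic i16 | |||
| | // shift sees b's sign in bit 15 and shifts in the correct sign bits. | |||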
| 31687 | SDValue ALo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), Amt); | |||
| 31688 | SDValue AHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), Amt); | |||
| 31689 | SDValue RLo = getUnpackl(DAG, dl, VT, DAG.getUNDEF(VT), R); | |||
| 31690 | SDValue RHi = getUnpackh(DAG, dl, VT, DAG.getUNDEF(VT), R); | |||
| 31691 | ALo = DAG.getBitcast(ExtVT, ALo); | |||
| 31692 | AHi = DAG.getBitcast(ExtVT, AHi); | |||
| 31693 | RLo = DAG.getBitcast(ExtVT, RLo); | |||
| 31694 | RHi = DAG.getBitcast(ExtVT, RHi); | |||
| 31695 | ||||
| 31696 | // r = VSELECT(r, shift(r, 4), a); | |||
| 31697 | SDValue MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 4, DAG); | |||
| 31698 | SDValue MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 4, DAG); | |||
| 31699 | RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); | |||
| 31700 | RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); | |||
| 31701 | ||||
| 31702 | // a += a | |||
| 31703 | ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); | |||
| 31704 | AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); | |||
| 31705 | ||||
| 31706 | // r = VSELECT(r, shift(r, 2), a); | |||
| 31707 | MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 2, DAG); | |||
| 31708 | MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 2, DAG); | |||
| 31709 | RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); | |||
| 31710 | RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); | |||
| 31711 | ||||
| 31712 | // a += a | |||
| 31713 | ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo); | |||
| 31714 | AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi); | |||
| 31715 | ||||
| 31716 | // r = VSELECT(r, shift(r, 1), a); | |||
| 31717 | MLo = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RLo, 1, DAG); | |||
| 31718 | MHi = getTargetVShiftByConstNode(X86OpcI, dl, ExtVT, RHi, 1, DAG); | |||
| 31719 | RLo = SignBitSelect(ExtVT, ALo, MLo, RLo); | |||
| 31720 | RHi = SignBitSelect(ExtVT, AHi, MHi, RHi); | |||
| 31721 | ||||
| 31722 | // Logical shift the result back to the lower byte, leaving a zero upper | |||
| 31723 | // byte meaning that we can safely pack with PACKUSWB. | |||
| 31724 | RLo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RLo, 8, DAG); | |||
| 31725 | RHi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, RHi, 8, DAG); | |||
| 31726 | return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi); | |||
| 31727 | } | |||
| 31728 | } | |||
| 31729 | ||||
| 31730 | if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) { | |||
| 31731 | MVT ExtVT = MVT::v8i32; | |||
| 31732 | SDValue Z = DAG.getConstant(0, dl, VT); | |||
| 31733 | SDValue ALo = getUnpackl(DAG, dl, VT, Amt, Z); | |||
| 31734 | SDValue AHi = getUnpackh(DAG, dl, VT, Amt, Z); | |||
| 31735 | SDValue RLo = getUnpackl(DAG, dl, VT, Z, R); | |||
| 31736 | SDValue RHi = getUnpackh(DAG, dl, VT, Z, R); | |||
| 31737 | ALo = DAG.getBitcast(ExtVT, ALo); | |||
| 31738 | AHi = DAG.getBitcast(ExtVT, AHi); | |||
| 31739 | RLo = DAG.getBitcast(ExtVT, RLo); | |||
| 31740 | RHi = DAG.getBitcast(ExtVT, RHi); | |||
| 31741 | SDValue Lo = DAG.getNode(Opc, dl, ExtVT, RLo, ALo); | |||
| 31742 | SDValue Hi = DAG.getNode(Opc, dl, ExtVT, RHi, AHi); | |||
| 31743 | Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Lo, 16, DAG); | |||
| 31744 | Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExtVT, Hi, 16, DAG); | |||
| 31745 | return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); | |||
| 31746 | } | |||
| 31747 | ||||
| 31748 | if (VT == MVT::v8i16) { | |||
| 31749 | // If we have a constant shift amount, the non-SSE41 path is best as | |||
| 31750 | // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW. | |||
| 31751 | bool UseSSE41 = Subtarget.hasSSE41() && | |||
| 31752 | !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); | |||
| 31753 | ||||
| 31754 | auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) { | |||
| 31755 | // On SSE41 targets we can use PBLENDVB which selects bytes based just on | |||
| 31756 | // the sign bit. | |||
| 31757 | if (UseSSE41) { | |||
| 31758 | MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2); | |||
| 31759 | V0 = DAG.getBitcast(ExtVT, V0); | |||
| 31760 | V1 = DAG.getBitcast(ExtVT, V1); | |||
| 31761 | Sel = DAG.getBitcast(ExtVT, Sel); | |||
| 31762 | return DAG.getBitcast( | |||
| 31763 | VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1)); | |||
| 31764 | } | |||
| 31765 | // On pre-SSE41 targets we splat the sign bit - a negative value will | |||
| 31766 | // set all bits of the lanes to true and VSELECT uses that in | |||
| 31767 | // its OR(AND(V0,C),AND(V1,~C)) lowering. | |||
| 31768 | SDValue C = | |||
| 31769 | getTargetVShiftByConstNode(X86ISD::VSRAI, dl, VT, Sel, 15, DAG); | |||
| 31770 | return DAG.getSelect(dl, VT, C, V0, V1); | |||
| 31771 | }; | |||
| 31772 | ||||
| 31773 | // Turn 'a' into a mask suitable for VSELECT: a = a << 12; | |||
| 31774 | if (UseSSE41) { | |||
| 31775 | // On SSE41 targets we need to replicate the shift mask in both | |||
| 31776 | // bytes for PBLENDVB. | |||
| 31777 | Amt = DAG.getNode( | |||
| 31778 | ISD::OR, dl, VT, | |||
| 31779 | getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 4, DAG), | |||
| 31780 | getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG)); | |||
| 31781 | } else { | |||
| 31782 | Amt = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Amt, 12, DAG); | |||
| 31783 | } | |||
| 31784 | ||||
| 31785 | // r = VSELECT(r, shift(r, 8), a); | |||
| 31786 | SDValue M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 8, DAG); | |||
| 31787 | R = SignBitSelect(Amt, M, R); | |||
| 31788 | ||||
| 31789 | // a += a | |||
| 31790 | Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); | |||
| 31791 | ||||
| 31792 | // r = VSELECT(r, shift(r, 4), a); | |||
| 31793 | M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 4, DAG); | |||
| 31794 | R = SignBitSelect(Amt, M, R); | |||
| 31795 | ||||
| 31796 | // a += a | |||
| 31797 | Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); | |||
| 31798 | ||||
| 31799 | // r = VSELECT(r, shift(r, 2), a); | |||
| 31800 | M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 2, DAG); | |||
| 31801 | R = SignBitSelect(Amt, M, R); | |||
| 31802 | ||||
| 31803 | // a += a | |||
| 31804 | Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt); | |||
| 31805 | ||||
| 31806 | // return VSELECT(r, shift(r, 1), a); | |||
| 31807 | M = getTargetVShiftByConstNode(X86OpcI, dl, VT, R, 1, DAG); | |||
| 31808 | R = SignBitSelect(Amt, M, R); | |||
| 31809 | return R; | |||
| 31810 | } | |||
| 31811 | ||||
| 31812 | // Decompose 256-bit shifts into 128-bit shifts. | |||
| 31813 | if (VT.is256BitVector()) | |||
| 31814 | return splitVectorIntBinary(Op, DAG); | |||
| 31815 | ||||
| 31816 | if (VT == MVT::v32i16 || VT == MVT::v64i8) | |||
| 31817 | return splitVectorIntBinary(Op, DAG); | |||
| 31818 | ||||
| 31819 | return SDValue(); | |||
| 31820 | } | |||
| 31821 | ||||
| 31822 | static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget, | |||
| 31823 | SelectionDAG &DAG) { | |||
| 31824 | MVT VT = Op.getSimpleValueType(); | |||
| 31825 | assert((Op.getOpcode() == ISD::FSHL || Op.getOpcode() == ISD::FSHR) && | |||
| 31826 | "Unexpected funnel shift opcode!"); | |||
| 31827 | ||||
| 31828 | SDLoc DL(Op); | |||
| 31829 | SDValue Op0 = Op.getOperand(0); | |||
| 31830 | SDValue Op1 = Op.getOperand(1); | |||
| 31831 | SDValue Amt = Op.getOperand(2); | |||
| 31832 | unsigned EltSizeInBits = VT.getScalarSizeInBits(); | |||
| 31833 | bool IsFSHR = Op.getOpcode() == ISD::FSHR; | |||
| 31834 | ||||
| 31835 | if (VT.isVector()) { | |||
| 31836 | APInt APIntShiftAmt; | |||
| 31837 | bool IsCstSplat = X86::isConstantSplat(Amt, APIntShiftAmt); | |||
| 31838 | ||||
| 31839 | if (Subtarget.hasVBMI2() && EltSizeInBits > 8) { | |||
| 31840 | if (IsFSHR) | |||
| 31841 | std::swap(Op0, Op1); | |||
| 31842 | ||||
| 31843 | if (IsCstSplat) { | |||
| 31844 | uint64_t ShiftAmt = APIntShiftAmt.urem(EltSizeInBits); | |||
| 31845 | SDValue Imm = DAG.getTargetConstant(ShiftAmt, DL, MVT::i8); | |||
| 31846 | return getAVX512Node(IsFSHR ? X86ISD::VSHRD : X86ISD::VSHLD, DL, VT, | |||
| 31847 | {Op0, Op1, Imm}, DAG, Subtarget); | |||
| 31848 | } | |||
| 31849 | return getAVX512Node(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT, | |||
| 31850 | {Op0, Op1, Amt}, DAG, Subtarget); | |||
| 31851 | } | |||
| 31852 | assert((VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8 || | |||
| 31853 | VT == MVT::v8i16 || VT == MVT::v16i16 || VT == MVT::v32i16 || | |||
| 31854 | VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) && | |||
| 31855 | "Unexpected funnel shift type!"); | |||
| 31856 | ||||
| 31857 | // fshl(x,y,z) -> (unpack(y,x) << (z & (bw-1))) >> bw. | |||
| 31858 | // fshr(x,y,z) -> unpack(y,x) >> (z & (bw-1)). | |||
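| | // Illustrative example (i8, z = 3): with W = (x << 8) | y (the unpack), | |||
| | // (W << 3) >> 8 truncated to i8 is (x << 3) | (y >> 5) = fshl(x, y, 3), and | |||
| | // W >> 3 truncated to i8 is (y >> 3) | (x << 5) = fshr(x, y, 3). | |||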
| 31859 | if (IsCstSplat) | |||
| 31860 | return SDValue(); | |||
| 31861 | ||||
| 31862 | SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT); | |||
| 31863 | SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); | |||
| 31864 | bool IsCst = ISD::isBuildVectorOfConstantSDNodes(AmtMod.getNode()); | |||
| 31865 | ||||
| 31866 | // Constant vXi16 funnel shifts can be efficiently handled by default. | |||
| 31867 | if (IsCst && EltSizeInBits == 16) | |||
| 31868 | return SDValue(); | |||
| 31869 | ||||
| 31870 | unsigned ShiftOpc = IsFSHR ? ISD::SRL : ISD::SHL; | |||
| 31871 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 31872 | MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits); | |||
| 31873 | MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2); | |||
| 31874 | ||||
| 31875 | // Split 256-bit integers on XOP/pre-AVX2 targets. | |||
| 31876 | // Split 512-bit integers on non 512-bit BWI targets. | |||
| 31877 | if ((VT.is256BitVector() && ((Subtarget.hasXOP() && EltSizeInBits < 16) || | |||
| 31878 | !Subtarget.hasAVX2())) || | |||
| 31879 | (VT.is512BitVector() && !Subtarget.useBWIRegs() && | |||
| 31880 | EltSizeInBits < 32)) { | |||
| 31881 | // Pre-mask the amount (modulo the element width) using the wider vector. | |||
| 31882 | Op = DAG.getNode(Op.getOpcode(), DL, VT, Op0, Op1, AmtMod); | |||
| 31883 | return splitVectorOp(Op, DAG); | |||
| 31884 | } | |||
| 31885 | ||||
| 31886 | // Attempt to fold scalar shift as unpack(y,x) << zext(splat(z)) | |||
| 31887 | if (supportedVectorShiftWithBaseAmnt(ExtVT, Subtarget, ShiftOpc)) { | |||
| 31888 | int ScalarAmtIdx = -1; | |||
| 31889 | if (SDValue ScalarAmt = DAG.getSplatSourceVector(AmtMod, ScalarAmtIdx)) { | |||
| 31890 | // Uniform vXi16 funnel shifts can be efficiently handled by default. | |||
| 31891 | if (EltSizeInBits == 16) | |||
| 31892 | return SDValue(); | |||
| 31893 | ||||
| 31894 | SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0)); | |||
| 31895 | SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0)); | |||
| 31896 | Lo = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Lo, ScalarAmt, | |||
| 31897 | ScalarAmtIdx, Subtarget, DAG); | |||
| 31898 | Hi = getTargetVShiftNode(ShiftOpc, DL, ExtVT, Hi, ScalarAmt, | |||
| 31899 | ScalarAmtIdx, Subtarget, DAG); | |||
| 31900 | return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR); | |||
| 31901 | } | |||
| 31902 | } | |||
| 31903 | ||||
| 31904 | MVT WideSVT = MVT::getIntegerVT( | |||
| 31905 | std::min<unsigned>(EltSizeInBits * 2, Subtarget.hasBWI() ? 16 : 32)); | |||
| 31906 | MVT WideVT = MVT::getVectorVT(WideSVT, NumElts); | |||
| 31907 | ||||
| 31908 | // If per-element shifts are legal, fallback to generic expansion. | |||
| 31909 | if (supportedVectorVarShift(VT, Subtarget, ShiftOpc) || Subtarget.hasXOP()) | |||
| 31910 | return SDValue(); | |||
| 31911 | ||||
| 31912 | // Attempt to fold as: | |||
| 31913 | // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw. | |||
| 31914 | // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))). | |||
| 31915 | if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) && | |||
| 31916 | supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) { | |||
| 31917 | Op0 = DAG.getNode(ISD::ANY_EXTEND, DL, WideVT, Op0); | |||
| 31918 | Op1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, Op1); | |||
| 31919 | AmtMod = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod); | |||
| 31920 | Op0 = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, Op0, | |||
| 31921 | EltSizeInBits, DAG); | |||
| 31922 | SDValue Res = DAG.getNode(ISD::OR, DL, WideVT, Op0, Op1); | |||
| 31923 | Res = DAG.getNode(ShiftOpc, DL, WideVT, Res, AmtMod); | |||
| 31924 | if (!IsFSHR) | |||
| 31925 | Res = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, Res, | |||
| 31926 | EltSizeInBits, DAG); | |||
| 31927 | return DAG.getNode(ISD::TRUNCATE, DL, VT, Res); | |||
| 31928 | } | |||
| 31929 | ||||
| 31930 | // Attempt to fold per-element (ExtVT) shift as unpack(y,x) << zext(z) | |||
| 31931 | if (((IsCst || !Subtarget.hasAVX512()) && !IsFSHR && EltSizeInBits <= 16) || | |||
| 31932 | supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) { | |||
| 31933 | SDValue Z = DAG.getConstant(0, DL, VT); | |||
| 31934 | SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, Op1, Op0)); | |||
| 31935 | SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, Op1, Op0)); | |||
| 31936 | SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z)); | |||
| 31937 | SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z)); | |||
| 31938 | SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo); | |||
| 31939 | SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi); | |||
| 31940 | return getPack(DAG, Subtarget, DL, VT, Lo, Hi, !IsFSHR); | |||
| 31941 | } | |||
| 31942 | ||||
| 31943 | // Fallback to generic expansion. | |||
| 31944 | return SDValue(); | |||
| 31945 | } | |||
| 31946 | assert( | |||
| 31947 | (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) && | |||
| 31948 | "Unexpected funnel shift type!"); | |||
| 31949 | ||||
| 31950 | // Expand slow SHLD/SHRD cases if we are not optimizing for size. | |||
| 31951 | bool OptForSize = DAG.shouldOptForSize(); | |||
| 31952 | bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow(); | |||
| 31953 | ||||
| 31954 | // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw. | |||
| 31955 | // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))). | |||
| 31956 | if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) && | |||
| 31957 | !isa<ConstantSDNode>(Amt)) { | |||
| 31958 | SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType()); | |||
| 31959 | SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType()); | |||
| 31960 | Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32); | |||
| 31961 | Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32); | |||
| 31962 | Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask); | |||
| 31963 | SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift); | |||
| 31964 | Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1); | |||
| 31965 | if (IsFSHR) { | |||
| 31966 | Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt); | |||
| 31967 | } else { | |||
| 31968 | Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt); | |||
| 31969 | Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift); | |||
| 31970 | } | |||
| 31971 | return DAG.getZExtOrTrunc(Res, DL, VT); | |||
| 31972 | } | |||
| 31973 | ||||
| 31974 | if (VT == MVT::i8 || ExpandFunnel) | |||
| 31975 | return SDValue(); | |||
| 31976 | ||||
| 31977 | // i16 needs to modulo the shift amount, but i32/i64 have implicit modulo. | |||
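| | // (Hardware masks the SHLD/SHRD count with 0x1F/0x3F for 32/64-bit operands, | |||
| | // which doesn't wrap correctly at 16 bits, hence the explicit AND 15 below.) | |||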
| 31978 | if (VT == MVT::i16) { | |||
| 31979 | Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, | |||
| 31980 | DAG.getConstant(15, DL, Amt.getValueType())); | |||
| 31981 | unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL); | |||
| 31982 | return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt); | |||
| 31983 | } | |||
| 31984 | ||||
| 31985 | return Op; | |||
| 31986 | } | |||
| 31987 | ||||
| 31988 | static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, | |||
| 31989 | SelectionDAG &DAG) { | |||
| 31990 | MVT VT = Op.getSimpleValueType(); | |||
| 31991 | assert(VT.isVector() && "Custom lowering only for vector rotates!"); | |||
| 31992 | ||||
| 31993 | SDLoc DL(Op); | |||
| 31994 | SDValue R = Op.getOperand(0); | |||
| 31995 | SDValue Amt = Op.getOperand(1); | |||
| 31996 | unsigned Opcode = Op.getOpcode(); | |||
| 31997 | unsigned EltSizeInBits = VT.getScalarSizeInBits(); | |||
| 31998 | int NumElts = VT.getVectorNumElements(); | |||
| 31999 | bool IsROTL = Opcode == ISD::ROTL; | |||
| 32000 | ||||
| 32001 | // Check for constant splat rotation amount. | |||
| 32002 | APInt CstSplatValue; | |||
| 32003 | bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue); | |||
| 32004 | ||||
| 32005 | // Check for splat rotate by zero. | |||
| 32006 | if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0) | |||
| 32007 | return R; | |||
| 32008 | ||||
| 32009 | // AVX512 implicitly uses modulo rotation amounts. | |||
| 32010 | if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) { | |||
| 32011 | // Attempt to rotate by immediate. | |||
| 32012 | if (IsCstSplat) { | |||
| 32013 | unsigned RotOpc = IsROTL ? X86ISD::VROTLI : X86ISD::VROTRI; | |||
| 32014 | uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits); | |||
| 32015 | return DAG.getNode(RotOpc, DL, VT, R, | |||
| 32016 | DAG.getTargetConstant(RotAmt, DL, MVT::i8)); | |||
| 32017 | } | |||
| 32018 | ||||
| 32019 | // Else, fall-back on VPROLV/VPRORV. | |||
| 32020 | return Op; | |||
| 32021 | } | |||
| 32022 | ||||
| 32023 | // AVX512 VBMI2 vXi16 - lower to funnel shifts. | |||
| 32024 | if (Subtarget.hasVBMI2() && 16 == EltSizeInBits) { | |||
| 32025 | unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR; | |||
| 32026 | return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt); | |||
| 32027 | } | |||
| 32028 | ||||
| 32029 | SDValue Z = DAG.getConstant(0, DL, VT); | |||
| 32030 | ||||
| 32031 | if (!IsROTL) { | |||
| 32032 | // If the ISD::ROTR amount is constant, we're always better converting to | |||
| 32033 | // ISD::ROTL. | |||
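| | // Illustrative example (i8): rotr(x, 3) == rotl(x, 8 - 3) == rotl(x, 5), so | |||
| | // a constant-foldable 0 - Amt flips the rotate direction for free. | |||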
| 32034 | if (SDValue NegAmt = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {Z, Amt})) | |||
| 32035 | return DAG.getNode(ISD::ROTL, DL, VT, R, NegAmt); | |||
| 32036 | ||||
| 32037 | // XOP targets always prefer ISD::ROTL. | |||
| 32038 | if (Subtarget.hasXOP()) | |||
| 32039 | return DAG.getNode(ISD::ROTL, DL, VT, R, | |||
| 32040 | DAG.getNode(ISD::SUB, DL, VT, Z, Amt)); | |||
| 32041 | } | |||
| 32042 | ||||
| 32043 | // Split 256-bit integers on XOP/pre-AVX2 targets. | |||
| 32044 | if (VT.is256BitVector() && (Subtarget.hasXOP() || !Subtarget.hasAVX2())) | |||
| 32045 | return splitVectorIntBinary(Op, DAG); | |||
| 32046 | ||||
| 32047 | // XOP has 128-bit vector variable + immediate rotates. | |||
| 32048 | // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL. | |||
| 32049 | // XOP implicitly uses modulo rotation amounts. | |||
| 32050 | if (Subtarget.hasXOP()) { | |||
| 32051 | assert(IsROTL && "Only ROTL expected"); | |||
| 32052 | assert(VT.is128BitVector() && "Only rotate 128-bit vectors!"); | |||
| 32053 | ||||
| 32054 | // Attempt to rotate by immediate. | |||
| 32055 | if (IsCstSplat) { | |||
| 32056 | uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits); | |||
| 32057 | return DAG.getNode(X86ISD::VROTLI, DL, VT, R, | |||
| 32058 | DAG.getTargetConstant(RotAmt, DL, MVT::i8)); | |||
| 32059 | } | |||
| 32060 | ||||
| 32061 | // Use general rotate by variable (per-element). | |||
| 32062 | return Op; | |||
| 32063 | } | |||
| 32064 | ||||
| 32065 | // Rotate by a uniform constant - expand back to shifts. | |||
| 32066 | if (IsCstSplat) | |||
| 32067 | return SDValue(); | |||
| 32068 | ||||
| 32069 | // Split 512-bit integers on non 512-bit BWI targets. | |||
| 32070 | if (VT.is512BitVector() && !Subtarget.useBWIRegs()) | |||
| 32071 | return splitVectorIntBinary(Op, DAG); | |||
| 32072 | ||||
| 32073 | assert( | |||
| 32074 | (VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 || | |||
| 32075 | ((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) && | |||
| 32076 | Subtarget.hasAVX2()) || | |||
| 32077 | ((VT == MVT::v32i16 || VT == MVT::v64i8) && Subtarget.useBWIRegs())) && | |||
| 32078 | "Only vXi32/vXi16/vXi8 vector rotates supported"); | |||
| 32079 | ||||
| 32080 | MVT ExtSVT = MVT::getIntegerVT(2 * EltSizeInBits); | |||
| 32081 | MVT ExtVT = MVT::getVectorVT(ExtSVT, NumElts / 2); | |||
| 32082 | ||||
| 32083 | SDValue AmtMask = DAG.getConstant(EltSizeInBits - 1, DL, VT); | |||
| 32084 | SDValue AmtMod = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); | |||
| 32085 | ||||
| 32086 | // Attempt to fold as unpack(x,x) << zext(splat(y)): | |||
| 32087 | // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw. | |||
| 32088 | // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))). | |||
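| | // Illustrative example (i8, y = 3): with W = (x << 8) | x, (W << 3) >> 8 | |||
| | // truncated to i8 is (x << 3) | (x >> 5) = rotl(x, 3), and W >> 3 truncated | |||
| | // is (x >> 3) | (x << 5) = rotr(x, 3). | |||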
| 32089 | if (EltSizeInBits == 8 || EltSizeInBits == 16 || EltSizeInBits == 32) { | |||
| 32090 | int BaseRotAmtIdx = -1; | |||
| 32091 | if (SDValue BaseRotAmt = DAG.getSplatSourceVector(AmtMod, BaseRotAmtIdx)) { | |||
| 32092 | if (EltSizeInBits == 16 && Subtarget.hasSSE41()) { | |||
| 32093 | unsigned FunnelOpc = IsROTL ? ISD::FSHL : ISD::FSHR; | |||
| 32094 | return DAG.getNode(FunnelOpc, DL, VT, R, R, Amt); | |||
| 32095 | } | |||
| 32096 | unsigned ShiftX86Opc = IsROTL ? X86ISD::VSHLI : X86ISD::VSRLI; | |||
| 32097 | SDValue Lo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R)); | |||
| 32098 | SDValue Hi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R)); | |||
| 32099 | Lo = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Lo, BaseRotAmt, | |||
| 32100 | BaseRotAmtIdx, Subtarget, DAG); | |||
| 32101 | Hi = getTargetVShiftNode(ShiftX86Opc, DL, ExtVT, Hi, BaseRotAmt, | |||
| 32102 | BaseRotAmtIdx, Subtarget, DAG); | |||
| 32103 | return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL); | |||
| 32104 | } | |||
| 32105 | } | |||
| 32106 | ||||
| 32107 | // v16i8/v32i8/v64i8: Split rotation into rot4/rot2/rot1 stages and select by | |||
| 32108 | // the amount bit. | |||
| 32109 | // TODO: We're doing nothing here that we couldn't do for funnel shifts. | |||
| 32110 | if (EltSizeInBits == 8) { | |||
| 32111 | bool IsConstAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); | |||
| 32112 | MVT WideVT = | |||
| 32113 | MVT::getVectorVT(Subtarget.hasBWI() ? MVT::i16 : MVT::i32, NumElts); | |||
| 32114 | unsigned ShiftOpc = IsROTL ? ISD::SHL : ISD::SRL; | |||
| 32115 | ||||
| 32116 | // Attempt to fold as: | |||
| 32117 | // rotl(x,y) -> (((aext(x) << bw) | zext(x)) << (y & (bw-1))) >> bw. | |||
| 32118 | // rotr(x,y) -> (((aext(x) << bw) | zext(x)) >> (y & (bw-1))). | |||
| 32119 | if (supportedVectorVarShift(WideVT, Subtarget, ShiftOpc) && | |||
| 32120 | supportedVectorShiftWithImm(WideVT, Subtarget, ShiftOpc)) { | |||
| 32121 | // If we're rotating by constant, just use default promotion. | |||
| 32122 | if (IsConstAmt) | |||
| 32123 | return SDValue(); | |||
| 32124 | // See if we can perform this by widening to vXi16 or vXi32. | |||
| 32125 | R = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, R); | |||
| 32126 | R = DAG.getNode( | |||
| 32127 | ISD::OR, DL, WideVT, R, | |||
| 32128 | getTargetVShiftByConstNode(X86ISD::VSHLI, DL, WideVT, R, 8, DAG)); | |||
| 32129 | Amt = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT, AmtMod); | |||
| 32130 | R = DAG.getNode(ShiftOpc, DL, WideVT, R, Amt); | |||
| 32131 | if (IsROTL) | |||
| 32132 | R = getTargetVShiftByConstNode(X86ISD::VSRLI, DL, WideVT, R, 8, DAG); | |||
| 32133 | return DAG.getNode(ISD::TRUNCATE, DL, VT, R); | |||
| 32134 | } | |||
| 32135 | ||||
| 32136 | // Attempt to fold as unpack(x,x) << zext(y): | |||
| 32137 | // rotl(x,y) -> (unpack(x,x) << (y & (bw-1))) >> bw. | |||
| 32138 | // rotr(x,y) -> (unpack(x,x) >> (y & (bw-1))). | |||
| 32139 | if (IsConstAmt || supportedVectorVarShift(ExtVT, Subtarget, ShiftOpc)) { | |||
| 32140 | // See if we can perform this by unpacking to lo/hi vXi16. | |||
| 32141 | SDValue RLo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, R, R)); | |||
| 32142 | SDValue RHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, R, R)); | |||
| 32143 | SDValue ALo = DAG.getBitcast(ExtVT, getUnpackl(DAG, DL, VT, AmtMod, Z)); | |||
| 32144 | SDValue AHi = DAG.getBitcast(ExtVT, getUnpackh(DAG, DL, VT, AmtMod, Z)); | |||
| 32145 | SDValue Lo = DAG.getNode(ShiftOpc, DL, ExtVT, RLo, ALo); | |||
| 32146 | SDValue Hi = DAG.getNode(ShiftOpc, DL, ExtVT, RHi, AHi); | |||
| 32147 | return getPack(DAG, Subtarget, DL, VT, Lo, Hi, IsROTL); | |||
| 32148 | } | |||
| 32149 | assert((VT == MVT::v16i8 || VT == MVT::v32i8) && "Unsupported vXi8 type"); | |||
| 32150 | ||||
| 32151 | // We don't need ModuloAmt here as we just peek at individual bits. | |||
| 32152 | auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) { | |||
| 32153 | if (Subtarget.hasSSE41()) { | |||
| 32154 | // On SSE41 targets we can use PBLENDVB which selects bytes based just | |||
| 32155 | // on the sign bit. | |||
| 32156 | V0 = DAG.getBitcast(VT, V0); | |||
| 32157 | V1 = DAG.getBitcast(VT, V1); | |||
| 32158 | Sel = DAG.getBitcast(VT, Sel); | |||
| 32159 | return DAG.getBitcast(SelVT, | |||
| 32160 | DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1)); | |||
| 32161 | } | |||
| 32162 | // On pre-SSE41 targets we test for the sign bit by comparing to | |||
| 32163 | // zero - a negative value will set all bits of the lanes to true | |||
| 32164 | // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. | |||
| 32165 | SDValue Z = DAG.getConstant(0, DL, SelVT); | |||
| 32166 | SDValue C = DAG.getNode(X86ISD::PCMPGT, DL, SelVT, Z, Sel); | |||
| 32167 | return DAG.getSelect(DL, SelVT, C, V0, V1); | |||
| 32168 | }; | |||
| 32169 | ||||
| 32170 | // ISD::ROTR is currently only profitable on AVX512 targets with VPTERNLOG. | |||
| 32171 | if (!IsROTL && !useVPTERNLOG(Subtarget, VT)) { | |||
| 32172 | Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt); | |||
| 32173 | IsROTL = true; | |||
| 32174 | } | |||
| 32175 | ||||
| 32176 | unsigned ShiftLHS = IsROTL ? ISD::SHL : ISD::SRL; | |||
| 32177 | unsigned ShiftRHS = IsROTL ? ISD::SRL : ISD::SHL; | |||
| 32178 | ||||
| 32179 | // Turn 'a' into a mask suitable for VSELECT: a = a << 5; | |||
| 32180 | // We can safely do this using i16 shifts as we're only interested in | |||
| 32181 | // the 3 lower bits of each byte. | |||
| 32182 | Amt = DAG.getBitcast(ExtVT, Amt); | |||
| 32183 | Amt = DAG.getNode(ISD::SHL, DL, ExtVT, Amt, DAG.getConstant(5, DL, ExtVT)); | |||
| 32184 | Amt = DAG.getBitcast(VT, Amt); | |||
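// After this shift, bit 2 of each byte's rotate amount sits in that byte's
// sign bit; each 'a += a' below moves the next lower bit into the sign bit,
// so SignBitSelect can pick each rotation stage from the sign bit alone.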
| 32185 | ||||
| 32186 | // r = VSELECT(r, rot(r, 4), a); | |||
| 32187 | SDValue M; | |||
| 32188 | M = DAG.getNode( | |||
| 32189 | ISD::OR, DL, VT, | |||
| 32190 | DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(4, DL, VT)), | |||
| 32191 | DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(4, DL, VT))); | |||
| 32192 | R = SignBitSelect(VT, Amt, M, R); | |||
| 32193 | ||||
| 32194 | // a += a | |||
| 32195 | Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt); | |||
| 32196 | ||||
| 32197 | // r = VSELECT(r, rot(r, 2), a); | |||
| 32198 | M = DAG.getNode( | |||
| 32199 | ISD::OR, DL, VT, | |||
| 32200 | DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(2, DL, VT)), | |||
| 32201 | DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(6, DL, VT))); | |||
| 32202 | R = SignBitSelect(VT, Amt, M, R); | |||
| 32203 | ||||
| 32204 | // a += a | |||
| 32205 | Amt = DAG.getNode(ISD::ADD, DL, VT, Amt, Amt); | |||
| 32206 | ||||
| 32207 | // return VSELECT(r, rot(r, 1), a); | |||
| 32208 | M = DAG.getNode( | |||
| 32209 | ISD::OR, DL, VT, | |||
| 32210 | DAG.getNode(ShiftLHS, DL, VT, R, DAG.getConstant(1, DL, VT)), | |||
| 32211 | DAG.getNode(ShiftRHS, DL, VT, R, DAG.getConstant(7, DL, VT))); | |||
| 32212 | return SignBitSelect(VT, Amt, M, R); | |||
| 32213 | } | |||
| 32214 | ||||
| 32215 | bool IsSplatAmt = DAG.isSplatValue(Amt); | |||
| 32216 | bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode()); | |||
| 32217 | bool LegalVarShifts = supportedVectorVarShift(VT, Subtarget, ISD::SHL) && | |||
| 32218 | supportedVectorVarShift(VT, Subtarget, ISD::SRL); | |||
| 32219 | ||||
| 32220 | // Fallback for splats + all supported variable shifts. | |||
| 32221 | // Fallback for non-constant AVX2 vXi16 as well. | |||
| 32222 | if (IsSplatAmt || LegalVarShifts || (Subtarget.hasAVX2() && !ConstantAmt)) { | |||
| 32223 | Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); | |||
| 32224 | SDValue AmtR = DAG.getConstant(EltSizeInBits, DL, VT); | |||
| 32225 | AmtR = DAG.getNode(ISD::SUB, DL, VT, AmtR, Amt); | |||
| 32226 | SDValue SHL = DAG.getNode(IsROTL ? ISD::SHL : ISD::SRL, DL, VT, R, Amt); | |||
| 32227 | SDValue SRL = DAG.getNode(IsROTL ? ISD::SRL : ISD::SHL, DL, VT, R, AmtR); | |||
| 32228 | return DAG.getNode(ISD::OR, DL, VT, SHL, SRL); | |||
| 32229 | } | |||
| 32230 | ||||
| 32231 | // Everything below assumes ISD::ROTL. | |||
| 32232 | if (!IsROTL) { | |||
| 32233 | Amt = DAG.getNode(ISD::SUB, DL, VT, Z, Amt); | |||
| 32234 | IsROTL = true; | |||
| 32235 | } | |||
| 32236 | ||||
| 32237 | // ISD::ROT* uses modulo rotate amounts. | |||
| 32238 | Amt = DAG.getNode(ISD::AND, DL, VT, Amt, AmtMask); | |||
| 32239 | ||||
| 32240 | assert(IsROTL && "Only ROTL supported"); | |||
| 32241 | ||||
| 32242 | // As with shifts, attempt to convert the rotation amount to a multiplication | |||
| 32243 | // factor, fallback to general expansion. | |||
| 32244 | SDValue Scale = convertShiftLeftToScale(Amt, DL, Subtarget, DAG); | |||
| 32245 | if (!Scale) | |||
| 32246 | return SDValue(); | |||
| 32247 | ||||
| 32248 | // v8i16/v16i16: perform unsigned multiply hi/lo and OR the results. | |||
| 32249 | if (EltSizeInBits == 16) { | |||
| 32250 | SDValue Lo = DAG.getNode(ISD::MUL, DL, VT, R, Scale); | |||
| 32251 | SDValue Hi = DAG.getNode(ISD::MULHU, DL, VT, R, Scale); | |||
| 32252 | return DAG.getNode(ISD::OR, DL, VT, Lo, Hi); | |||
| 32253 | } | |||
| 32254 | ||||
| 32255 | // v4i32: make use of the PMULUDQ instruction to multiply 2 lanes of v4i32 | |||
| 32256 | // to v2i64 results at a time. The upper 32-bits contain the wrapped bits | |||
| 32257 | // that can then be OR'd with the lower 32-bits. | |||
| 32258 | assert(VT == MVT::v4i32 && "Only v4i32 vector rotate expected"); | |||
| 32259 | static const int OddMask[] = {1, -1, 3, -1}; | |||
| 32260 | SDValue R13 = DAG.getVectorShuffle(VT, DL, R, R, OddMask); | |||
| 32261 | SDValue Scale13 = DAG.getVectorShuffle(VT, DL, Scale, Scale, OddMask); | |||
| 32262 | ||||
| 32263 | SDValue Res02 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64, | |||
| 32264 | DAG.getBitcast(MVT::v2i64, R), | |||
| 32265 | DAG.getBitcast(MVT::v2i64, Scale)); | |||
| 32266 | SDValue Res13 = DAG.getNode(X86ISD::PMULUDQ, DL, MVT::v2i64, | |||
| 32267 | DAG.getBitcast(MVT::v2i64, R13), | |||
| 32268 | DAG.getBitcast(MVT::v2i64, Scale13)); | |||
| 32269 | Res02 = DAG.getBitcast(VT, Res02); | |||
| 32270 | Res13 = DAG.getBitcast(VT, Res13); | |||
| 32271 | ||||
| 32272 | return DAG.getNode(ISD::OR, DL, VT, | |||
| 32273 | DAG.getVectorShuffle(VT, DL, Res02, Res13, {0, 4, 2, 6}), | |||
| 32274 | DAG.getVectorShuffle(VT, DL, Res02, Res13, {1, 5, 3, 7})); | |||
| 32275 | } | |||
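
// Editor's note: the following is an illustrative, standalone scalar model of
// the expansion strategies used in LowerRotate above (staged vXi8 rotate,
// shl|srl fallback, and rotate-by-multiply). It is a sketch for exposition
// only, not part of the lowering; the helper names are hypothetical.
namespace {
// Staged rotate: apply rot4/rot2/rot1 depending on the amount bits, exactly
// what the SignBitSelect chain above does per byte lane.
inline unsigned char rotl8Staged(unsigned char R, unsigned Amt) {
  Amt &= 7; // modulo rotate amount
  if (Amt & 4) R = (unsigned char)((R << 4) | (R >> 4));
  if (Amt & 2) R = (unsigned char)((R << 2) | (R >> 6));
  if (Amt & 1) R = (unsigned char)((R << 1) | (R >> 7));
  return R;
}
// shl|srl fallback. The extra mask keeps this scalar model well-defined for
// Amt == 0; the vector form above doesn't need it because x86 vector shifts
// by amounts >= the element width simply produce zero.
inline unsigned rotl32Expand(unsigned X, unsigned Amt) {
  Amt &= 31;
  return (X << Amt) | (X >> ((32 - Amt) & 31));
}
// Rotate-by-multiply: X * 2^C at double width leaves (X << C) in the low half
// and the wrapped-around bits (X >> (W - C)) in the high half, so ORing the
// MUL and MULHU halves yields the rotate, as in the vXi16/v4i32 paths above.
inline unsigned short rotl16Mul(unsigned short X, unsigned C) {
  unsigned Prod = (unsigned)X << (C & 15); // 32-bit product X * 2^C
  return (unsigned short)(Prod | (Prod >> 16));
}
} // namespace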
| 32276 | ||||
| 32277 | /// Returns true if the operand type is exactly twice the native width, and | |||
| 32278 | /// the corresponding cmpxchg8b or cmpxchg16b instruction is available. | |||
| 32279 | /// Used to know whether to use cmpxchg8/16b when expanding atomic operations | |||
| 32280 | /// (otherwise we leave them alone to become __sync_fetch_and_... calls). | |||
| 32281 | bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const { | |||
| 32282 | unsigned OpWidth = MemType->getPrimitiveSizeInBits(); | |||
| 32283 | ||||
| 32284 | if (OpWidth == 64) | |||
| 32285 | return Subtarget.canUseCMPXCHG8B() && !Subtarget.is64Bit(); | |||
| 32286 | if (OpWidth == 128) | |||
| 32287 | return Subtarget.canUseCMPXCHG16B(); | |||
| 32288 | ||||
| 32289 | return false; | |||
| 32290 | } | |||
| 32291 | ||||
| 32292 | TargetLoweringBase::AtomicExpansionKind | |||
| 32293 | X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const { | |||
| 32294 | Type *MemType = SI->getValueOperand()->getType(); | |||
| 32295 | ||||
| 32296 | bool NoImplicitFloatOps = | |||
| 32297 | SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat); | |||
| 32298 | if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && | |||
| 32299 | !Subtarget.useSoftFloat() && !NoImplicitFloatOps && | |||
| 32300 | (Subtarget.hasSSE1() || Subtarget.hasX87())) | |||
| 32301 | return AtomicExpansionKind::None; | |||
| 32302 | ||||
| 32303 | return needsCmpXchgNb(MemType) ? AtomicExpansionKind::Expand | |||
| 32304 | : AtomicExpansionKind::None; | |||
| 32305 | } | |||
| 32306 | ||||
| 32307 | // Note: this turns large loads into lock cmpxchg8b/16b. | |||
| 32308 | // TODO: In 32-bit mode, use MOVLPS when SSE1 is available? | |||
| 32309 | TargetLowering::AtomicExpansionKind | |||
| 32310 | X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const { | |||
| 32311 | Type *MemType = LI->getType(); | |||
| 32312 | ||||
| 32313 | // If this is a 64-bit atomic load on a 32-bit target and SSE2 is enabled, we | |||
| 32314 | // can use movq to do the load. If we have X87 we can load into an 80-bit | |||
| 32315 | // X87 register and store it to a stack temporary. | |||
| 32316 | bool NoImplicitFloatOps = | |||
| 32317 | LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat); | |||
| 32318 | if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() && | |||
| 32319 | !Subtarget.useSoftFloat() && !NoImplicitFloatOps && | |||
| 32320 | (Subtarget.hasSSE1() || Subtarget.hasX87())) | |||
| 32321 | return AtomicExpansionKind::None; | |||
| 32322 | ||||
| 32323 | return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg | |||
| 32324 | : AtomicExpansionKind::None; | |||
| 32325 | } | |||
| 32326 | ||||
| 32327 | enum BitTestKind : unsigned { | |||
| 32328 | UndefBit, | |||
| 32329 | ConstantBit, | |||
| 32330 | NotConstantBit, | |||
| 32331 | ShiftBit, | |||
| 32332 | NotShiftBit | |||
| 32333 | }; | |||
| 32334 | ||||
| 32335 | static std::pair<Value *, BitTestKind> FindSingleBitChange(Value *V) { | |||
| 32336 | using namespace llvm::PatternMatch; | |||
| 32337 | BitTestKind BTK = UndefBit; | |||
| 32338 | auto *C = dyn_cast<ConstantInt>(V); | |||
| 32339 | if (C) { | |||
| 32340 | // Check if V is a power of 2 or the NOT of a power of 2. | |||
| 32341 | if (isPowerOf2_64(C->getZExtValue())) | |||
| 32342 | BTK = ConstantBit; | |||
| 32343 | else if (isPowerOf2_64((~C->getValue()).getZExtValue())) | |||
| 32344 | BTK = NotConstantBit; | |||
| 32345 | return {V, BTK}; | |||
| 32346 | } | |||
| 32347 | ||||
| 32348 | // Check if V is some power of 2 pattern known to be non-zero | |||
| 32349 | auto *I = dyn_cast<Instruction>(V); | |||
| 32350 | if (I) { | |||
| 32351 | bool Not = false; | |||
| 32352 | // Check if we have a NOT | |||
| 32353 | Value *PeekI; | |||
| 32354 | if (match(I, m_c_Xor(m_Value(PeekI), m_AllOnes())) || | |||
| 32355 | match(I, m_Sub(m_AllOnes(), m_Value(PeekI)))) { | |||
| 32356 | Not = true; | |||
| 32357 | I = dyn_cast<Instruction>(PeekI); | |||
| 32358 | ||||
| 32359 | // If I is constant, it will fold and we can evaluate later. If it's an | |||
| 32360 | // argument or something of that nature, we can't analyze. | |||
| 32361 | if (I == nullptr) | |||
| 32362 | return {nullptr, UndefBit}; | |||
| 32363 | } | |||
| 32364 | // We can only use 1 << X without more sophisticated analysis. C << X where | |||
| 32365 | // C is a power of 2 but not 1 can result in zero, which cannot be translated | |||
| 32366 | // to a bit test. Likewise any C >> X (either arithmetic or logical) can be zero. | |||
| 32367 | if (I->getOpcode() == Instruction::Shl) { | |||
| 32368 | // Todo(1): The cmpxchg case is pretty costly so matching `BLSI(X)`, `X & | |||
| 32369 | // -X` and some other provable power of 2 patterns that we can use CTZ on | |||
| 32370 | // may be profitable. | |||
| 32371 | // Todo(2): It may be possible in some cases to prove that Shl(C, X) is | |||
| 32372 | // non-zero even where C != 1. Likewise LShr(C, X) and AShr(C, X) may also | |||
| 32373 | // be provably a non-zero power of 2. | |||
| 32374 | // Todo(3): ROTL and ROTR patterns on a power of 2 C should also be | |||
| 32375 | // transformable to bittest. | |||
| 32376 | auto *ShiftVal = dyn_cast<ConstantInt>(I->getOperand(0)); | |||
| 32377 | if (!ShiftVal) | |||
| 32378 | return {nullptr, UndefBit}; | |||
| 32379 | if (ShiftVal->equalsInt(1)) | |||
| 32380 | BTK = Not ? NotShiftBit : ShiftBit; | |||
| 32381 | ||||
| 32382 | if (BTK == UndefBit) | |||
| 32383 | return {nullptr, UndefBit}; | |||
| 32384 | ||||
| 32385 | Value *BitV = I->getOperand(1); | |||
| 32386 | ||||
| 32387 | Value *AndOp; | |||
| 32388 | const APInt *AndC; | |||
| 32389 | if (match(BitV, m_c_And(m_Value(AndOp), m_APInt(AndC)))) { | |||
| 32390 | // Read past a shift-mask instruction to find the count. | |||
| 32391 | if (*AndC == (I->getType()->getPrimitiveSizeInBits() - 1)) | |||
| 32392 | BitV = AndOp; | |||
| 32393 | } | |||
| 32394 | return {BitV, BTK}; | |||
| 32395 | } | |||
| 32396 | } | |||
| 32397 | return {nullptr, UndefBit}; | |||
| 32398 | } | |||
| 32399 | ||||
| 32400 | TargetLowering::AtomicExpansionKind | |||
| 32401 | X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const { | |||
| 32402 | using namespace llvm::PatternMatch; | |||
| 32403 | // If the atomicrmw's result isn't actually used, we can just add a "lock" | |||
| 32404 | // prefix to a normal instruction for these operations. | |||
| 32405 | if (AI->use_empty()) | |||
| 32406 | return AtomicExpansionKind::None; | |||
| 32407 | ||||
| 32408 | if (AI->getOperation() == AtomicRMWInst::Xor) { | |||
| 32409 | // A ^ SignBit -> A + SignBit. This allows us to use `xadd` which is | |||
| 32410 | // preferable to both `cmpxchg` and `btc`. | |||
| 32411 | if (match(AI->getOperand(1), m_SignMask())) | |||
| 32412 | return AtomicExpansionKind::None; | |||
| 32413 | } | |||
| 32414 | ||||
| 32415 | // If the atomicrmw's result is used by a single-bit AND, we may use a | |||
| 32416 | // bts/btr/btc instruction for these operations. | |||
| 32417 | // Note: InstCombinePass can cause a de-optimization here. It replaces the | |||
| 32418 | // SETCC(And(AtomicRMW(P, power_of_2), power_of_2)) with LShr and Xor | |||
| 32419 | // (depending on CC). This pattern can only use bts/btr/btc but we don't | |||
| 32420 | // detect it. | |||
| 32421 | Instruction *I = AI->user_back(); | |||
| 32422 | auto BitChange = FindSingleBitChange(AI->getValOperand()); | |||
| 32423 | if (BitChange.second == UndefBit || !AI->hasOneUse() || | |||
| 32424 | I->getOpcode() != Instruction::And || | |||
| 32425 | AI->getType()->getPrimitiveSizeInBits() == 8 || | |||
| 32426 | AI->getParent() != I->getParent()) | |||
| 32427 | return AtomicExpansionKind::CmpXChg; | |||
| 32428 | ||||
| 32429 | unsigned OtherIdx = I->getOperand(0) == AI ? 1 : 0; | |||
| 32430 | ||||
| 32431 | // This is a redundant AND, it should get cleaned up elsewhere. | |||
| 32432 | if (AI == I->getOperand(OtherIdx)) | |||
| 32433 | return AtomicExpansionKind::CmpXChg; | |||
| 32434 | ||||
| 32435 | // The following instruction must be an AND with a single bit. | |||
| 32436 | if (BitChange.second == ConstantBit || BitChange.second == NotConstantBit) { | |||
| 32437 | auto *C1 = cast<ConstantInt>(AI->getValOperand()); | |||
| 32438 | auto *C2 = dyn_cast<ConstantInt>(I->getOperand(OtherIdx)); | |||
| 32439 | if (!C2 || !isPowerOf2_64(C2->getZExtValue())) { | |||
| 32440 | return AtomicExpansionKind::CmpXChg; | |||
| 32441 | } | |||
| 32442 | if (AI->getOperation() == AtomicRMWInst::And) { | |||
| 32443 | return ~C1->getValue() == C2->getValue() | |||
| 32444 | ? AtomicExpansionKind::BitTestIntrinsic | |||
| 32445 | : AtomicExpansionKind::CmpXChg; | |||
| 32446 | } | |||
| 32447 | return C1 == C2 ? AtomicExpansionKind::BitTestIntrinsic | |||
| 32448 | : AtomicExpansionKind::CmpXChg; | |||
| 32449 | } | |||
| 32450 | ||||
| 32451 | assert(BitChange.second == ShiftBit || BitChange.second == NotShiftBit); | |||
| 32452 | ||||
| 32453 | auto BitTested = FindSingleBitChange(I->getOperand(OtherIdx)); | |||
| 32454 | if (BitTested.second != ShiftBit && BitTested.second != NotShiftBit) | |||
| 32455 | return AtomicExpansionKind::CmpXChg; | |||
| 32456 | ||||
| 32457 | assert(BitChange.first != nullptr && BitTested.first != nullptr); | |||
| 32458 | ||||
| 32459 | // If shift amounts are not the same we can't use BitTestIntrinsic. | |||
| 32460 | if (BitChange.first != BitTested.first) | |||
| 32461 | return AtomicExpansionKind::CmpXChg; | |||
| 32462 | ||||
| 32463 | // For atomic AND, the mask must have all bits set except one, and we must | |||
| 32464 | // be testing exactly the bit that is unset in the mask. | |||
| 32465 | if (AI->getOperation() == AtomicRMWInst::And) | |||
| 32466 | return (BitChange.second == NotShiftBit && BitTested.second == ShiftBit) | |||
| 32467 | ? AtomicExpansionKind::BitTestIntrinsic | |||
| 32468 | : AtomicExpansionKind::CmpXChg; | |||
| 32469 | ||||
| 32470 | // For atomic XOR/OR, we must be setting and testing the same bit. | |||
| 32471 | return (BitChange.second == ShiftBit && BitTested.second == ShiftBit) | |||
| 32472 | ? AtomicExpansionKind::BitTestIntrinsic | |||
| 32473 | : AtomicExpansionKind::CmpXChg; | |||
| 32474 | } | |||
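
// Editor's illustration (standalone sketch, not part of this file): source
// patterns the matching above accepts, assuming the usual atomicrmw lowering
// of std::atomic. Each AND of the RMW result with the same single bit can be
// lowered to one lock bts/btr/btc via BitTestIntrinsic.
#include <atomic>

// ConstantBit: RMW operand and tested bit are the same power-of-2 constant.
static bool setAndTestBit4(std::atomic<unsigned> &Flags) {
  return Flags.fetch_or(0x10u) & 0x10u; // lock bts
}

// ShiftBit: both sides are 1 << (N & 31) with the same shift amount; the
// '& 31' satisfies the shift-mask check in FindSingleBitChange.
static bool setAndTestBitN(std::atomic<unsigned> &Flags, unsigned N) {
  unsigned Bit = 1u << (N & 31);
  return Flags.fetch_or(Bit) & Bit; // lock bts
}

// NotShiftBit + ShiftBit: atomic AND clears one bit while testing that bit.
static bool clearAndTestBitN(std::atomic<unsigned> &Flags, unsigned N) {
  unsigned Bit = 1u << (N & 31);
  return Flags.fetch_and(~Bit) & Bit; // lock btr
}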
| 32475 | ||||
| 32476 | void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const { | |||
| 32477 | IRBuilder<> Builder(AI); | |||
| 32478 | Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections}); | |||
| 32479 | Intrinsic::ID IID_C = Intrinsic::not_intrinsic; | |||
| 32480 | Intrinsic::ID IID_I = Intrinsic::not_intrinsic; | |||
| 32481 | switch (AI->getOperation()) { | |||
| 32482 | default: | |||
| 32483 | llvm_unreachable("Unknown atomic operation")::llvm::llvm_unreachable_internal("Unknown atomic operation", "llvm/lib/Target/X86/X86ISelLowering.cpp", 32483); | |||
| 32484 | case AtomicRMWInst::Or: | |||
| 32485 | IID_C = Intrinsic::x86_atomic_bts; | |||
| 32486 | IID_I = Intrinsic::x86_atomic_bts_rm; | |||
| 32487 | break; | |||
| 32488 | case AtomicRMWInst::Xor: | |||
| 32489 | IID_C = Intrinsic::x86_atomic_btc; | |||
| 32490 | IID_I = Intrinsic::x86_atomic_btc_rm; | |||
| 32491 | break; | |||
| 32492 | case AtomicRMWInst::And: | |||
| 32493 | IID_C = Intrinsic::x86_atomic_btr; | |||
| 32494 | IID_I = Intrinsic::x86_atomic_btr_rm; | |||
| 32495 | break; | |||
| 32496 | } | |||
| 32497 | Instruction *I = AI->user_back(); | |||
| 32498 | LLVMContext &Ctx = AI->getContext(); | |||
| 32499 | Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(), | |||
| 32500 | Type::getInt8PtrTy(Ctx)); | |||
| 32501 | Function *BitTest = nullptr; | |||
| 32502 | Value *Result = nullptr; | |||
| 32503 | auto BitTested = FindSingleBitChange(AI->getValOperand()); | |||
| 32504 | assert(BitTested.first != nullptr); | |||
| 32505 | ||||
| 32506 | if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) { | |||
| 32507 | auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0)); | |||
| 32508 | ||||
| 32509 | BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_C, AI->getType()); | |||
| 32510 | ||||
| 32511 | unsigned Imm = llvm::countr_zero(C->getZExtValue()); | |||
| 32512 | Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)}); | |||
| 32513 | } else { | |||
| 32514 | BitTest = Intrinsic::getDeclaration(AI->getModule(), IID_I, AI->getType()); | |||
| 32515 | ||||
| 32516 | assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit); | |||
| 32517 | ||||
| 32518 | Value *SI = BitTested.first; | |||
| 32519 | assert(SI != nullptr); | |||
| 32520 | ||||
| 32521 | // BT{S|R|C} on a memory operand doesn't take the bit position modulo the | |||
| 32522 | // operand width, so we need to mask it ourselves. | |||
| 32523 | unsigned ShiftBits = SI->getType()->getPrimitiveSizeInBits(); | |||
| 32524 | Value *BitPos = | |||
| 32525 | Builder.CreateAnd(SI, Builder.getIntN(ShiftBits, ShiftBits - 1)); | |||
| 32526 | // Todo(1): In many cases it may be provable that SI is less than | |||
| 32527 | // ShiftBits in which case this mask is unnecessary | |||
| 32528 | // Todo(2): In the fairly idiomatic case of P[X / sizeof_bits(X)] OP 1 | |||
| 32529 | // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in | |||
| 32530 | // favor of just a raw BT{S|R|C}. | |||
| 32531 | ||||
| 32532 | Result = Builder.CreateCall(BitTest, {Addr, BitPos}); | |||
| 32533 | Result = Builder.CreateZExtOrTrunc(Result, AI->getType()); | |||
| 32534 | ||||
| 32535 | // If the result is only used for zero/non-zero status then we don't need | |||
| 32536 | // to shift the value back. Otherwise do so. | |||
| 32537 | for (auto It = I->user_begin(); It != I->user_end(); ++It) { | |||
| 32538 | if (auto *ICmp = dyn_cast<ICmpInst>(*It)) { | |||
| 32539 | if (ICmp->isEquality()) { | |||
| 32540 | auto *C0 = dyn_cast<ConstantInt>(ICmp->getOperand(0)); | |||
| 32541 | auto *C1 = dyn_cast<ConstantInt>(ICmp->getOperand(1)); | |||
| 32542 | if (C0 || C1) { | |||
| 32543 | assert(C0 == nullptr || C1 == nullptr); | |||
| 32544 | if ((C0 ? C0 : C1)->isZero()) | |||
| 32545 | continue; | |||
| 32546 | } | |||
| 32547 | } | |||
| 32548 | } | |||
| 32549 | Result = Builder.CreateShl(Result, BitPos); | |||
| 32550 | break; | |||
| 32551 | } | |||
| 32552 | } | |||
| 32553 | ||||
| 32554 | I->replaceAllUsesWith(Result); | |||
| 32555 | I->eraseFromParent(); | |||
| 32556 | AI->eraseFromParent(); | |||
| 32557 | } | |||
| 32558 | ||||
| 32559 | static bool shouldExpandCmpArithRMWInIR(AtomicRMWInst *AI) { | |||
| 32560 | using namespace llvm::PatternMatch; | |||
| 32561 | if (!AI->hasOneUse()) | |||
| 32562 | return false; | |||
| 32563 | ||||
| 32564 | Value *Op = AI->getOperand(1); | |||
| 32565 | ICmpInst::Predicate Pred; | |||
| 32566 | Instruction *I = AI->user_back(); | |||
| 32567 | AtomicRMWInst::BinOp Opc = AI->getOperation(); | |||
| 32568 | if (Opc == AtomicRMWInst::Add) { | |||
| 32569 | if (match(I, m_c_ICmp(Pred, m_Sub(m_ZeroInt(), m_Specific(Op)), m_Value()))) | |||
| 32570 | return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE; | |||
| 32571 | if (match(I, m_OneUse(m_c_Add(m_Specific(Op), m_Value())))) { | |||
| 32572 | if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt()))) | |||
| 32573 | return Pred == CmpInst::ICMP_SLT; | |||
| 32574 | if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes()))) | |||
| 32575 | return Pred == CmpInst::ICMP_SGT; | |||
| 32576 | } | |||
| 32577 | return false; | |||
| 32578 | } | |||
| 32579 | if (Opc == AtomicRMWInst::Sub) { | |||
| 32580 | if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value()))) | |||
| 32581 | return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE; | |||
| 32582 | if (match(I, m_OneUse(m_Sub(m_Value(), m_Specific(Op))))) { | |||
| 32583 | if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt()))) | |||
| 32584 | return Pred == CmpInst::ICMP_SLT; | |||
| 32585 | if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes()))) | |||
| 32586 | return Pred == CmpInst::ICMP_SGT; | |||
| 32587 | } | |||
| 32588 | return false; | |||
| 32589 | } | |||
| 32590 | if ((Opc == AtomicRMWInst::Or && | |||
| 32591 | match(I, m_OneUse(m_c_Or(m_Specific(Op), m_Value())))) || | |||
| 32592 | (Opc == AtomicRMWInst::And && | |||
| 32593 | match(I, m_OneUse(m_c_And(m_Specific(Op), m_Value()))))) { | |||
| 32594 | if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt()))) | |||
| 32595 | return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE || | |||
| 32596 | Pred == CmpInst::ICMP_SLT; | |||
| 32597 | if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes()))) | |||
| 32598 | return Pred == CmpInst::ICMP_SGT; | |||
| 32599 | return false; | |||
| 32600 | } | |||
| 32601 | if (Opc == AtomicRMWInst::Xor) { | |||
| 32602 | if (match(I, m_c_ICmp(Pred, m_Specific(Op), m_Value()))) | |||
| 32603 | return Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE; | |||
| 32604 | if (match(I, m_OneUse(m_c_Xor(m_Specific(Op), m_Value())))) { | |||
| 32605 | if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_ZeroInt()))) | |||
| 32606 | return Pred == CmpInst::ICMP_SLT; | |||
| 32607 | if (match(I->user_back(), m_ICmp(Pred, m_Value(), m_AllOnes()))) | |||
| 32608 | return Pred == CmpInst::ICMP_SGT; | |||
| 32609 | } | |||
| 32610 | return false; | |||
| 32611 | } | |||
| 32612 | ||||
| 32613 | return false; | |||
| 32614 | } | |||
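
// Editor's illustration (standalone sketch, hypothetical names): idiomatic
// sources of the patterns above, assuming the usual std::atomic lowering.
// Each becomes a single flag-producing locked RMW (e.g. lock sub + SETE)
// instead of a cmpxchg loop or a separate compare.
#include <atomic>

// icmp eq (atomicrmw sub %p, 1), 1: the new value is zero; read ZF.
static bool releaseRef(std::atomic<int> &RefCnt) {
  return RefCnt.fetch_sub(1) == 1;
}

// icmp slt (add (atomicrmw add %p, %v), %v), 0: new value's sign; read SF.
static bool addGoesNegative(std::atomic<int> &Ctr, int Delta) {
  return Ctr.fetch_add(Delta) + Delta < 0;
}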
| 32615 | ||||
| 32616 | void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic( | |||
| 32617 | AtomicRMWInst *AI) const { | |||
| 32618 | IRBuilder<> Builder(AI); | |||
| 32619 | Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections}); | |||
| 32620 | Instruction *TempI = nullptr; | |||
| 32621 | LLVMContext &Ctx = AI->getContext(); | |||
| 32622 | ICmpInst *ICI = dyn_cast<ICmpInst>(AI->user_back()); | |||
| 32623 | if (!ICI) { | |||
| 32624 | TempI = AI->user_back(); | |||
| 32625 | assert(TempI->hasOneUse() && "Must have one use")(static_cast <bool> (TempI->hasOneUse() && "Must have one use" ) ? void (0) : __assert_fail ("TempI->hasOneUse() && \"Must have one use\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 32625, __extension__ __PRETTY_FUNCTION__)); | |||
| 32626 | ICI = cast<ICmpInst>(TempI->user_back()); | |||
| 32627 | } | |||
| 32628 | X86::CondCode CC = X86::COND_INVALID; | |||
| 32629 | ICmpInst::Predicate Pred = ICI->getPredicate(); | |||
| 32630 | switch (Pred) { | |||
| 32631 | default: | |||
| 32632 | llvm_unreachable("Not supported Pred")::llvm::llvm_unreachable_internal("Not supported Pred", "llvm/lib/Target/X86/X86ISelLowering.cpp" , 32632); | |||
| 32633 | case CmpInst::ICMP_EQ: | |||
| 32634 | CC = X86::COND_E; | |||
| 32635 | break; | |||
| 32636 | case CmpInst::ICMP_NE: | |||
| 32637 | CC = X86::COND_NE; | |||
| 32638 | break; | |||
| 32639 | case CmpInst::ICMP_SLT: | |||
| 32640 | CC = X86::COND_S; | |||
| 32641 | break; | |||
| 32642 | case CmpInst::ICMP_SGT: | |||
| 32643 | CC = X86::COND_NS; | |||
| 32644 | break; | |||
| 32645 | } | |||
| 32646 | Intrinsic::ID IID = Intrinsic::not_intrinsic; | |||
| 32647 | switch (AI->getOperation()) { | |||
| 32648 | default: | |||
| 32649 | llvm_unreachable("Unknown atomic operation")::llvm::llvm_unreachable_internal("Unknown atomic operation", "llvm/lib/Target/X86/X86ISelLowering.cpp", 32649); | |||
| 32650 | case AtomicRMWInst::Add: | |||
| 32651 | IID = Intrinsic::x86_atomic_add_cc; | |||
| 32652 | break; | |||
| 32653 | case AtomicRMWInst::Sub: | |||
| 32654 | IID = Intrinsic::x86_atomic_sub_cc; | |||
| 32655 | break; | |||
| 32656 | case AtomicRMWInst::Or: | |||
| 32657 | IID = Intrinsic::x86_atomic_or_cc; | |||
| 32658 | break; | |||
| 32659 | case AtomicRMWInst::And: | |||
| 32660 | IID = Intrinsic::x86_atomic_and_cc; | |||
| 32661 | break; | |||
| 32662 | case AtomicRMWInst::Xor: | |||
| 32663 | IID = Intrinsic::x86_atomic_xor_cc; | |||
| 32664 | break; | |||
| 32665 | } | |||
| 32666 | Function *CmpArith = | |||
| 32667 | Intrinsic::getDeclaration(AI->getModule(), IID, AI->getType()); | |||
| 32668 | Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(), | |||
| 32669 | Type::getInt8PtrTy(Ctx)); | |||
| 32670 | Value *Call = Builder.CreateCall( | |||
| 32671 | CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)}); | |||
| 32672 | Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx)); | |||
| 32673 | ICI->replaceAllUsesWith(Result); | |||
| 32674 | ICI->eraseFromParent(); | |||
| 32675 | if (TempI) | |||
| 32676 | TempI->eraseFromParent(); | |||
| 32677 | AI->eraseFromParent(); | |||
| 32678 | } | |||
| 32679 | ||||
| 32680 | TargetLowering::AtomicExpansionKind | |||
| 32681 | X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { | |||
| 32682 | unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; | |||
| 32683 | Type *MemType = AI->getType(); | |||
| 32684 | ||||
| 32685 | // If the operand is too big, we must see if cmpxchg8/16b is available | |||
| 32686 | // and default to library calls otherwise. | |||
| 32687 | if (MemType->getPrimitiveSizeInBits() > NativeWidth) { | |||
| 32688 | return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg | |||
| 32689 | : AtomicExpansionKind::None; | |||
| 32690 | } | |||
| 32691 | ||||
| 32692 | AtomicRMWInst::BinOp Op = AI->getOperation(); | |||
| 32693 | switch (Op) { | |||
| 32694 | case AtomicRMWInst::Xchg: | |||
| 32695 | return AtomicExpansionKind::None; | |||
| 32696 | case AtomicRMWInst::Add: | |||
| 32697 | case AtomicRMWInst::Sub: | |||
| 32698 | if (shouldExpandCmpArithRMWInIR(AI)) | |||
| 32699 | return AtomicExpansionKind::CmpArithIntrinsic; | |||
| 32700 | // It's better to use xadd, xsub or xchg for these in other cases. | |||
| 32701 | return AtomicExpansionKind::None; | |||
| 32702 | case AtomicRMWInst::Or: | |||
| 32703 | case AtomicRMWInst::And: | |||
| 32704 | case AtomicRMWInst::Xor: | |||
| 32705 | if (shouldExpandCmpArithRMWInIR(AI)) | |||
| 32706 | return AtomicExpansionKind::CmpArithIntrinsic; | |||
| 32707 | return shouldExpandLogicAtomicRMWInIR(AI); | |||
| 32708 | case AtomicRMWInst::Nand: | |||
| 32709 | case AtomicRMWInst::Max: | |||
| 32710 | case AtomicRMWInst::Min: | |||
| 32711 | case AtomicRMWInst::UMax: | |||
| 32712 | case AtomicRMWInst::UMin: | |||
| 32713 | case AtomicRMWInst::FAdd: | |||
| 32714 | case AtomicRMWInst::FSub: | |||
| 32715 | case AtomicRMWInst::FMax: | |||
| 32716 | case AtomicRMWInst::FMin: | |||
| 32717 | case AtomicRMWInst::UIncWrap: | |||
| 32718 | case AtomicRMWInst::UDecWrap: | |||
| 32719 | default: | |||
| 32720 | // These always require a non-trivial set of data operations on x86. We must | |||
| 32721 | // use a cmpxchg loop. | |||
| 32722 | return AtomicExpansionKind::CmpXChg; | |||
| 32723 | } | |||
| 32724 | } | |||
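
// Editor's note (illustrative, assuming the usual std::atomic lowering): the
// AtomicExpansionKind::None cases above map onto single instructions, e.g.
//   Old = X.exchange(V);  // xchg
//   Old = X.fetch_add(V); // lock xadd (result used)
//   X.fetch_or(Mask);     // lock or   (result unused)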
| 32725 | ||||
| 32726 | LoadInst * | |||
| 32727 | X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { | |||
| 32728 | unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32; | |||
| 32729 | Type *MemType = AI->getType(); | |||
| 32730 | // Accesses larger than the native width are turned into cmpxchg/libcalls, so | |||
| 32731 | // there is no benefit in turning such RMWs into loads, and it is actually | |||
| 32732 | // harmful as it introduces an mfence. | |||
| 32733 | if (MemType->getPrimitiveSizeInBits() > NativeWidth) | |||
| 32734 | return nullptr; | |||
| 32735 | ||||
| 32736 | // If this is a canonical idempotent atomicrmw w/no uses, we have a better | |||
| 32737 | // lowering available in lowerAtomicArith. | |||
| 32738 | // TODO: push more cases through this path. | |||
| 32739 | if (auto *C = dyn_cast<ConstantInt>(AI->getValOperand())) | |||
| 32740 | if (AI->getOperation() == AtomicRMWInst::Or && C->isZero() && | |||
| 32741 | AI->use_empty()) | |||
| 32742 | return nullptr; | |||
| 32743 | ||||
| 32744 | IRBuilder<> Builder(AI); | |||
| 32745 | Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections}); | |||
| 32746 | Module *M = Builder.GetInsertBlock()->getParent()->getParent(); | |||
| 32747 | auto SSID = AI->getSyncScopeID(); | |||
| 32748 | // We must restrict the ordering to avoid generating loads with Release or | |||
| 32749 | // ReleaseAcquire orderings. | |||
| 32750 | auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); | |||
| 32751 | ||||
| 32752 | // Before the load we need a fence. Here is an example lifted from | |||
| 32753 | // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence | |||
| 32754 | // is required: | |||
| 32755 | // Thread 0: | |||
| 32756 | // x.store(1, relaxed); | |||
| 32757 | // r1 = y.fetch_add(0, release); | |||
| 32758 | // Thread 1: | |||
| 32759 | // y.fetch_add(42, acquire); | |||
| 32760 | // r2 = x.load(relaxed); | |||
| 32761 | // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is | |||
| 32762 | // lowered to just a load without a fence. An mfence flushes the store buffer, | |||
| 32763 | // making the optimization clearly correct. | |||
| 32764 | // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear | |||
| 32765 | // otherwise, we might be able to be more aggressive on relaxed idempotent | |||
| 32766 | // rmw. In practice, they do not look useful, so we don't try to be | |||
| 32767 | // especially clever. | |||
| 32768 | if (SSID == SyncScope::SingleThread) | |||
| 32769 | // FIXME: we could just insert an ISD::MEMBARRIER here, except we are at | |||
| 32770 | // the IR level, so we must wrap it in an intrinsic. | |||
| 32771 | return nullptr; | |||
| 32772 | ||||
| 32773 | if (!Subtarget.hasMFence()) | |||
| 32774 | // FIXME: it might make sense to use a locked operation here but on a | |||
| 32775 | // different cache-line to prevent cache-line bouncing. In practice it | |||
| 32776 | // is probably a small win, and x86 processors without mfence are rare | |||
| 32777 | // enough that we do not bother. | |||
| 32778 | return nullptr; | |||
| 32779 | ||||
| 32780 | Function *MFence = | |||
| 32781 | llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence); | |||
| 32782 | Builder.CreateCall(MFence, {}); | |||
| 32783 | ||||
| 32784 | // Finally we can emit the atomic load. | |||
| 32785 | LoadInst *Loaded = Builder.CreateAlignedLoad( | |||
| 32786 | AI->getType(), AI->getPointerOperand(), AI->getAlign()); | |||
| 32787 | Loaded->setAtomic(Order, SSID); | |||
| 32788 | AI->replaceAllUsesWith(Loaded); | |||
| 32789 | AI->eraseFromParent(); | |||
| 32790 | return Loaded; | |||
| 32791 | } | |||
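
// Editor's illustration (standalone sketch, hypothetical name): the kind of
// idempotent RMW this hook rewrites. With a seq_cst ordering it becomes
// 'mfence' followed by a plain atomic mov load instead of a locked RMW.
#include <atomic>

static unsigned loadViaIdempotentRMW(std::atomic<unsigned> &A) {
  return A.fetch_add(0); // idempotent; rewritten to mfence + load when possible
}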
| 32792 | ||||
| 32793 | bool X86TargetLowering::lowerAtomicStoreAsStoreSDNode(const StoreInst &SI) const { | |||
| 32794 | if (!SI.isUnordered()) | |||
| 32795 | return false; | |||
| 32796 | return ExperimentalUnorderedISEL; | |||
| 32797 | } | |||
| 32798 | bool X86TargetLowering::lowerAtomicLoadAsLoadSDNode(const LoadInst &LI) const { | |||
| 32799 | if (!LI.isUnordered()) | |||
| 32800 | return false; | |||
| 32801 | return ExperimentalUnorderedISEL; | |||
| 32802 | } | |||
| 32803 | ||||
| 32804 | ||||
| 32805 | /// Emit a locked operation on a stack location which does not change any | |||
| 32806 | /// memory location, but does involve a lock prefix. Location is chosen to be | |||
| 32807 | /// a) very likely accessed only by a single thread to minimize cache traffic, | |||
| 32808 | /// and b) definitely dereferenceable. Returns the new Chain result. | |||
| 32809 | static SDValue emitLockedStackOp(SelectionDAG &DAG, | |||
| 32810 | const X86Subtarget &Subtarget, SDValue Chain, | |||
| 32811 | const SDLoc &DL) { | |||
| 32812 | // Implementation notes: | |||
| 32813 | // 1) LOCK prefix creates a full read/write reordering barrier for memory | |||
| 32814 | // operations issued by the current processor. As such, the location | |||
| 32815 | // referenced is not relevant for the ordering properties of the instruction. | |||
| 32816 | // See: Intel® 64 and IA-32 Architectures Software Developer’s Manual, | |||
| 32817 | // 8.2.3.9 Loads and Stores Are Not Reordered with Locked Instructions | |||
| 32818 | // 2) Using an immediate operand appears to be the best encoding choice | |||
| 32819 | // here since it doesn't require an extra register. | |||
| 32820 | // 3) OR appears to be very slightly faster than ADD. (Though, the difference | |||
| 32821 | // is small enough it might just be measurement noise.) | |||
| 32822 | // 4) When choosing offsets, there are several contributing factors: | |||
| 32823 | // a) If there's no redzone, we default to TOS. (We could allocate a cache | |||
| 32824 | // line aligned stack object to improve this case.) | |||
| 32825 | // b) To minimize our chances of introducing a false dependence, we prefer | |||
| 32826 | // to offset the stack usage from TOS slightly. | |||
| 32827 | // c) To minimize concerns about cross thread stack usage - in particular, | |||
| 32828 | // the idiomatic MyThreadPool.run([&StackVars]() {...}) pattern which | |||
| 32829 | // captures state in the TOS frame and accesses it from many threads - | |||
| 32830 | // we want to use an offset such that the offset is in a distinct cache | |||
| 32831 | // line from the TOS frame. | |||
| 32832 | // | |||
| 32833 | // For a general discussion of the tradeoffs and benchmark results, see: | |||
| 32834 | // https://shipilev.net/blog/2014/on-the-fence-with-dependencies/ | |||
| 32835 | ||||
| 32836 | auto &MF = DAG.getMachineFunction(); | |||
| 32837 | auto &TFL = *Subtarget.getFrameLowering(); | |||
| 32838 | const unsigned SPOffset = TFL.has128ByteRedZone(MF) ? -64 : 0; | |||
| 32839 | ||||
| 32840 | if (Subtarget.is64Bit()) { | |||
| 32841 | SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); | |||
| 32842 | SDValue Ops[] = { | |||
| 32843 | DAG.getRegister(X86::RSP, MVT::i64), // Base | |||
| 32844 | DAG.getTargetConstant(1, DL, MVT::i8), // Scale | |||
| 32845 | DAG.getRegister(0, MVT::i64), // Index | |||
| 32846 | DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp | |||
| 32847 | DAG.getRegister(0, MVT::i16), // Segment. | |||
| 32848 | Zero, | |||
| 32849 | Chain}; | |||
| 32850 | SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, | |||
| 32851 | MVT::Other, Ops); | |||
| 32852 | return SDValue(Res, 1); | |||
| 32853 | } | |||
| 32854 | ||||
| 32855 | SDValue Zero = DAG.getTargetConstant(0, DL, MVT::i32); | |||
| 32856 | SDValue Ops[] = { | |||
| 32857 | DAG.getRegister(X86::ESP, MVT::i32), // Base | |||
| 32858 | DAG.getTargetConstant(1, DL, MVT::i8), // Scale | |||
| 32859 | DAG.getRegister(0, MVT::i32), // Index | |||
| 32860 | DAG.getTargetConstant(SPOffset, DL, MVT::i32), // Disp | |||
| 32861 | DAG.getRegister(0, MVT::i16), // Segment. | |||
| 32862 | Zero, | |||
| 32863 | Chain | |||
| 32864 | }; | |||
| 32865 | SDNode *Res = DAG.getMachineNode(X86::OR32mi8Locked, DL, MVT::i32, | |||
| 32866 | MVT::Other, Ops); | |||
| 32867 | return SDValue(Res, 1); | |||
| 32868 | } | |||
| 32869 | ||||
| 32870 | static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, | |||
| 32871 | SelectionDAG &DAG) { | |||
| 32872 | SDLoc dl(Op); | |||
| 32873 | AtomicOrdering FenceOrdering = | |||
| 32874 | static_cast<AtomicOrdering>(Op.getConstantOperandVal(1)); | |||
| 32875 | SyncScope::ID FenceSSID = | |||
| 32876 | static_cast<SyncScope::ID>(Op.getConstantOperandVal(2)); | |||
| 32877 | ||||
| 32878 | // The only fence that needs an instruction is a sequentially-consistent | |||
| 32879 | // cross-thread fence. | |||
| 32880 | if (FenceOrdering == AtomicOrdering::SequentiallyConsistent && | |||
| 32881 | FenceSSID == SyncScope::System) { | |||
| 32882 | if (Subtarget.hasMFence()) | |||
| 32883 | return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); | |||
| 32884 | ||||
| 32885 | SDValue Chain = Op.getOperand(0); | |||
| 32886 | return emitLockedStackOp(DAG, Subtarget, Chain, dl); | |||
| 32887 | } | |||
| 32888 | ||||
| 32889 | // MEMBARRIER is a compiler barrier; it codegens to a no-op. | |||
| 32890 | return DAG.getNode(ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0)); | |||
| 32891 | } | |||
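
// Editor's illustration (standalone sketch): how the cases above arise from
// source, assuming the usual fence lowering.
#include <atomic>

static void fenceExamples() {
  std::atomic_thread_fence(std::memory_order_seq_cst); // MFENCE, or the locked
                                                       // stack op fallback
  std::atomic_thread_fence(std::memory_order_acquire); // compiler barrier
                                                       // only; no instruction
}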
| 32892 | ||||
| 32893 | static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget, | |||
| 32894 | SelectionDAG &DAG) { | |||
| 32895 | MVT T = Op.getSimpleValueType(); | |||
| 32896 | SDLoc DL(Op); | |||
| 32897 | unsigned Reg = 0; | |||
| 32898 | unsigned size = 0; | |||
| 32899 | switch(T.SimpleTy) { | |||
| 32900 | default: llvm_unreachable("Invalid value type!")::llvm::llvm_unreachable_internal("Invalid value type!", "llvm/lib/Target/X86/X86ISelLowering.cpp" , 32900); | |||
| 32901 | case MVT::i8: Reg = X86::AL; size = 1; break; | |||
| 32902 | case MVT::i16: Reg = X86::AX; size = 2; break; | |||
| 32903 | case MVT::i32: Reg = X86::EAX; size = 4; break; | |||
| 32904 | case MVT::i64: | |||
| 32905 | assert(Subtarget.is64Bit() && "Node not type legal!"); | |||
| 32906 | Reg = X86::RAX; size = 8; | |||
| 32907 | break; | |||
| 32908 | } | |||
| 32909 | SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg, | |||
| 32910 | Op.getOperand(2), SDValue()); | |||
| 32911 | SDValue Ops[] = { cpIn.getValue(0), | |||
| 32912 | Op.getOperand(1), | |||
| 32913 | Op.getOperand(3), | |||
| 32914 | DAG.getTargetConstant(size, DL, MVT::i8), | |||
| 32915 | cpIn.getValue(1) }; | |||
| 32916 | SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); | |||
| 32917 | MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand(); | |||
| 32918 | SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys, | |||
| 32919 | Ops, T, MMO); | |||
| 32920 | ||||
| 32921 | SDValue cpOut = | |||
| 32922 | DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1)); | |||
| 32923 | SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS, | |||
| 32924 | MVT::i32, cpOut.getValue(2)); | |||
| 32925 | SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG); | |||
| 32926 | ||||
| 32927 | return DAG.getNode(ISD::MERGE_VALUES, DL, Op->getVTList(), | |||
| 32928 | cpOut, Success, EFLAGS.getValue(1)); | |||
| 32929 | } | |||
| 32930 | ||||
| 32931 | // Create MOVMSKB, taking into account whether we need to split for AVX1. | |||
| 32932 | static SDValue getPMOVMSKB(const SDLoc &DL, SDValue V, SelectionDAG &DAG, | |||
| 32933 | const X86Subtarget &Subtarget) { | |||
| 32934 | MVT InVT = V.getSimpleValueType(); | |||
| 32935 | ||||
| 32936 | if (InVT == MVT::v64i8) { | |||
| 32937 | SDValue Lo, Hi; | |||
| 32938 | std::tie(Lo, Hi) = DAG.SplitVector(V, DL); | |||
| 32939 | Lo = getPMOVMSKB(DL, Lo, DAG, Subtarget); | |||
| 32940 | Hi = getPMOVMSKB(DL, Hi, DAG, Subtarget); | |||
| 32941 | Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Lo); | |||
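// Hi only needs ANY_EXTEND: the shift below moves its low 32 bits into the
// upper half of the result, so whatever the extension leaves in the high
// bits is shifted out.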
| 32942 | Hi = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, Hi); | |||
| 32943 | Hi = DAG.getNode(ISD::SHL, DL, MVT::i64, Hi, | |||
| 32944 | DAG.getConstant(32, DL, MVT::i8)); | |||
| 32945 | return DAG.getNode(ISD::OR, DL, MVT::i64, Lo, Hi); | |||
| 32946 | } | |||
| 32947 | if (InVT == MVT::v32i8 && !Subtarget.hasInt256()) { | |||
| 32948 | SDValue Lo, Hi; | |||
| 32949 | std::tie(Lo, Hi) = DAG.SplitVector(V, DL); | |||
| 32950 | Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo); | |||
| 32951 | Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi); | |||
| 32952 | Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi, | |||
| 32953 | DAG.getConstant(16, DL, MVT::i8)); | |||
| 32954 | return DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi); | |||
| 32955 | } | |||
| 32956 | ||||
| 32957 | return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); | |||
| 32958 | } | |||
| 32959 | ||||
| 32960 | static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget, | |||
| 32961 | SelectionDAG &DAG) { | |||
| 32962 | SDValue Src = Op.getOperand(0); | |||
| 32963 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 32964 | MVT DstVT = Op.getSimpleValueType(); | |||
| 32965 | ||||
| 32966 | // Legalize (v64i1 (bitcast i64 (X))) by splitting the i64, bitcasting each | |||
| 32967 | // half to v32i1 and concatenating the result. | |||
| 32968 | if (SrcVT == MVT::i64 && DstVT == MVT::v64i1) { | |||
| 32969 | assert(!Subtarget.is64Bit() && "Expected 32-bit mode"); | |||
| 32970 | assert(Subtarget.hasBWI() && "Expected BWI target"); | |||
| 32971 | SDLoc dl(Op); | |||
| 32972 | SDValue Lo, Hi; | |||
| 32973 | std::tie(Lo, Hi) = DAG.SplitScalar(Src, dl, MVT::i32, MVT::i32); | |||
| 32974 | Lo = DAG.getBitcast(MVT::v32i1, Lo); | |||
| 32975 | Hi = DAG.getBitcast(MVT::v32i1, Hi); | |||
| 32976 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi); | |||
| 32977 | } | |||
| 32978 | ||||
| 32979 | // Use MOVMSK for vector to scalar conversion to prevent scalarization. | |||
| 32980 | if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) { | |||
| 32981 | assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512"); | |||
| 32982 | MVT SExtVT = SrcVT == MVT::v16i1 ? MVT::v16i8 : MVT::v32i8; | |||
| 32983 | SDLoc DL(Op); | |||
| 32984 | SDValue V = DAG.getSExtOrTrunc(Src, DL, SExtVT); | |||
| 32985 | V = getPMOVMSKB(DL, V, DAG, Subtarget); | |||
| 32986 | return DAG.getZExtOrTrunc(V, DL, DstVT); | |||
| 32987 | } | |||
| 32988 | ||||
| 32989 | assert((SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 || | |||
| 32990 | SrcVT == MVT::i64) && "Unexpected VT!"); | |||
| 32991 | ||||
| 32992 | assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); | |||
| 32993 | if (!(DstVT == MVT::f64 && SrcVT == MVT::i64) && | |||
| 32994 | !(DstVT == MVT::x86mmx && SrcVT.isVector())) | |||
| 32995 | // This conversion needs to be expanded. | |||
| 32996 | return SDValue(); | |||
| 32997 | ||||
| 32998 | SDLoc dl(Op); | |||
| 32999 | if (SrcVT.isVector()) { | |||
| 33000 | // Widen the input vector in the case of MVT::v2i32, | |||
| 33001 | // e.g. from MVT::v2i32 to MVT::v4i32. | |||
| 33002 | MVT NewVT = MVT::getVectorVT(SrcVT.getVectorElementType(), | |||
| 33003 | SrcVT.getVectorNumElements() * 2); | |||
| 33004 | Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewVT, Src, | |||
| 33005 | DAG.getUNDEF(SrcVT)); | |||
| 33006 | } else { | |||
| 33007 | assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() && | |||
| 33008 | "Unexpected source type in LowerBITCAST"); | |||
| 33009 | Src = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Src); | |||
| 33010 | } | |||
| 33011 | ||||
| 33012 | MVT V2X64VT = DstVT == MVT::f64 ? MVT::v2f64 : MVT::v2i64; | |||
| 33013 | Src = DAG.getNode(ISD::BITCAST, dl, V2X64VT, Src); | |||
| 33014 | ||||
| 33015 | if (DstVT == MVT::x86mmx) | |||
| 33016 | return DAG.getNode(X86ISD::MOVDQ2Q, dl, DstVT, Src); | |||
| 33017 | ||||
| 33018 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, DstVT, Src, | |||
| 33019 | DAG.getIntPtrConstant(0, dl)); | |||
| 33020 | } | |||
| 33021 | ||||
| 33022 | /// Compute the horizontal sum of bytes in V for the elements of VT. | |||
| 33023 | /// | |||
| 33024 | /// Requires V to be a byte vector and VT to be an integer vector type with | |||
| 33025 | /// wider elements than V's type. The width of the elements of VT determines | |||
| 33026 | /// how many bytes of V are summed horizontally to produce each element of the | |||
| 33027 | /// result. | |||
| 33028 | static SDValue LowerHorizontalByteSum(SDValue V, MVT VT, | |||
| 33029 | const X86Subtarget &Subtarget, | |||
| 33030 | SelectionDAG &DAG) { | |||
| 33031 | SDLoc DL(V); | |||
| 33032 | MVT ByteVecVT = V.getSimpleValueType(); | |||
| 33033 | MVT EltVT = VT.getVectorElementType(); | |||
| 33034 | assert(ByteVecVT.getVectorElementType() == MVT::i8 && | |||
| 33035 | "Expected value to have byte element type."); | |||
| 33036 | assert(EltVT != MVT::i8 && | |||
| 33037 | "Horizontal byte sum only makes sense for wider elements!"); | |||
| 33038 | unsigned VecSize = VT.getSizeInBits(); | |||
| 33039 | assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!"); | |||
| 33040 | ||||
| 33041 | // The PSADBW instruction horizontally adds all bytes and leaves the results | |||
| 33042 | // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64. | |||
| 33043 | if (EltVT == MVT::i64) { | |||
| 33044 | SDValue Zeros = DAG.getConstant(0, DL, ByteVecVT); | |||
| 33045 | MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); | |||
| 33046 | V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros); | |||
| 33047 | return DAG.getBitcast(VT, V); | |||
| 33048 | } | |||
| 33049 | ||||
| 33050 | if (EltVT == MVT::i32) { | |||
| 33051 | // We unpack the low half and high half into i32s interleaved with zeros so | |||
| 33052 | // that we can use PSADBW to horizontally sum them. The most useful part of | |||
| 33053 | // this is that it lines up the results of two PSADBW instructions to be | |||
| 33054 | // two v2i64 vectors which concatenated are the 4 population counts. We can | |||
| 33055 | // then use PACKUSWB to shrink and concatenate them into a v4i32 again. | |||
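| | // Editor's illustrative walk-through (v4i32 case, little-endian lanes): with | |||
| | // per-byte counts V = [b0..b15], the unpacks pair each i32 lane with zeros, so | |||
| | // every 64-bit chunk holds one lane's four count bytes plus four zero bytes. | |||
| | // PSADBW then yields Low = [c0, c1] and High = [c2, c3] as v2i64, and the | |||
| | // PACKUS below reassembles them into the v4i32 result [c0, c1, c2, c3]. | |||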
| 33056 | SDValue Zeros = DAG.getConstant(0, DL, VT); | |||
| 33057 | SDValue V32 = DAG.getBitcast(VT, V); | |||
| 33058 | SDValue Low = getUnpackl(DAG, DL, VT, V32, Zeros); | |||
| 33059 | SDValue High = getUnpackh(DAG, DL, VT, V32, Zeros); | |||
| 33060 | ||||
| 33061 | // Do the horizontal sums into two v2i64s. | |||
| 33062 | Zeros = DAG.getConstant(0, DL, ByteVecVT); | |||
| 33063 | MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64); | |||
| 33064 | Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, | |||
| 33065 | DAG.getBitcast(ByteVecVT, Low), Zeros); | |||
| 33066 | High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, | |||
| 33067 | DAG.getBitcast(ByteVecVT, High), Zeros); | |||
| 33068 | ||||
| 33069 | // Merge them together. | |||
| 33070 | MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16); | |||
| 33071 | V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT, | |||
| 33072 | DAG.getBitcast(ShortVecVT, Low), | |||
| 33073 | DAG.getBitcast(ShortVecVT, High)); | |||
| 33074 | ||||
| 33075 | return DAG.getBitcast(VT, V); | |||
| 33076 | } | |||
| 33077 | ||||
| 33078 | // The only element type left is i16. | |||
| 33079 | assert(EltVT == MVT::i16 && "Unknown how to handle type"); | |||
| 33080 | ||||
| 33081 | // To obtain the pop count for each i16 element starting from the pop counts | |||
| 33082 | // of its i8 halves, shift the i16s left by 8, sum as i8s, and then shift as | |||
| 33083 | // i16s right by 8. It is important to shift as i16s because an i8 vector | |||
| 33084 | // shift isn't directly supported. | |||
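| | // Editor's sketch: the same trick modelled on a scalar uint16_t holding two | |||
| | // per-byte counts (an assumed-equivalent illustration, not part of the DAG; | |||
| | // byteWiseAdd is a hypothetical per-byte addition): | |||
| | //   uint16_t Counts = 0x0302;                // hi byte count 3, lo count 2 | |||
| | //   uint16_t Shl = Counts << 8;              // 0x0200: lo count -> hi byte | |||
| | //   uint16_t Sum = byteWiseAdd(Shl, Counts); // 0x0502: hi byte now 3 + 2 | |||
| | //   uint16_t Res = Sum >> 8;                 // 0x0005 == popcount of lane | |||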
| 33085 | SDValue ShifterV = DAG.getConstant(8, DL, VT); | |||
| 33086 | SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV); | |||
| 33087 | V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl), | |||
| 33088 | DAG.getBitcast(ByteVecVT, V)); | |||
| 33089 | return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV); | |||
| 33090 | } | |||
| 33091 | ||||
| 33092 | static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL, | |||
| 33093 | const X86Subtarget &Subtarget, | |||
| 33094 | SelectionDAG &DAG) { | |||
| 33095 | MVT VT = Op.getSimpleValueType(); | |||
| 33096 | MVT EltVT = VT.getVectorElementType(); | |||
| 33097 | int NumElts = VT.getVectorNumElements(); | |||
| 33098 | (void)EltVT; | |||
| 33099 | assert(EltVT == MVT::i8 && "Only vXi8 vector CTPOP lowering supported."); | |||
| 33100 | ||||
| 33101 | // Implement a lookup table in register by using an algorithm based on: | |||
| 33102 | // http://wm.ite.pl/articles/sse-popcount.html | |||
| 33103 | // | |||
| 33104 | // The general idea is that every lower byte nibble in the input vector is an | |||
| 33105 | // index into an in-register pre-computed pop count table. We then split up the | |||
| 33106 | // input vector into two new ones: (1) a vector with only the shifted-right | |||
| 33107 | // higher nibbles for each byte and (2) a vector with the lower nibbles (and | |||
| 33108 | // masked-out higher ones) for each byte. PSHUFB is used separately with both | |||
| 33109 | // to index the in-register table. Next, both are added and the result is an | |||
| 33110 | // i8 vector where each element contains the pop count for its input byte. | |||
| 33111 | const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2, | |||
| 33112 | /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3, | |||
| 33113 | /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3, | |||
| 33114 | /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4}; | |||
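| | // Editor's worked example: for an input byte 0xE9 (0b11101001) the high | |||
| | // nibble 0xE looks up LUT[14] = 3 and the low nibble 0x9 looks up LUT[9] = 2, | |||
| | // so the per-byte result is 3 + 2 = 5 == ctpop(0xE9). | |||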
| 33115 | ||||
| 33116 | SmallVector<SDValue, 64> LUTVec; | |||
| 33117 | for (int i = 0; i < NumElts; ++i) | |||
| 33118 | LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8)); | |||
| 33119 | SDValue InRegLUT = DAG.getBuildVector(VT, DL, LUTVec); | |||
| 33120 | SDValue M0F = DAG.getConstant(0x0F, DL, VT); | |||
| 33121 | ||||
| 33122 | // High nibbles | |||
| 33123 | SDValue FourV = DAG.getConstant(4, DL, VT); | |||
| 33124 | SDValue HiNibbles = DAG.getNode(ISD::SRL, DL, VT, Op, FourV); | |||
| 33125 | ||||
| 33126 | // Low nibbles | |||
| 33127 | SDValue LoNibbles = DAG.getNode(ISD::AND, DL, VT, Op, M0F); | |||
| 33128 | ||||
| 33129 | // The input vector is used as the shuffle mask that indexes elements into the | |||
| 33130 | // LUT. After counting low and high nibbles, add the two vectors to obtain the | |||
| 33131 | // final pop count per i8 element. | |||
| 33132 | SDValue HiPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, HiNibbles); | |||
| 33133 | SDValue LoPopCnt = DAG.getNode(X86ISD::PSHUFB, DL, VT, InRegLUT, LoNibbles); | |||
| 33134 | return DAG.getNode(ISD::ADD, DL, VT, HiPopCnt, LoPopCnt); | |||
| 33135 | } | |||
| 33136 | ||||
| 33137 | // Please ensure that any codegen change from LowerVectorCTPOP is reflected in | |||
| 33138 | // updated cost models in X86TTIImpl::getIntrinsicInstrCost. | |||
| 33139 | static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, | |||
| 33140 | SelectionDAG &DAG) { | |||
| 33141 | MVT VT = Op.getSimpleValueType(); | |||
| 33142 | assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) && | |||
| 33143 | "Unknown CTPOP type to handle"); | |||
| 33144 | SDLoc DL(Op.getNode()); | |||
| 33145 | SDValue Op0 = Op.getOperand(0); | |||
| 33146 | ||||
| 33147 | // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions. | |||
| 33148 | if (Subtarget.hasVPOPCNTDQ()) { | |||
| 33149 | unsigned NumElems = VT.getVectorNumElements(); | |||
| 33150 | assert((VT.getVectorElementType() == MVT::i8 || | |||
| 33151 | VT.getVectorElementType() == MVT::i16) && "Unexpected type"); | |||
| 33152 | if (NumElems < 16 || (NumElems == 16 && Subtarget.canExtendTo512DQ())) { | |||
| 33153 | MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); | |||
| 33154 | Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0); | |||
| 33155 | Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op); | |||
| 33156 | return DAG.getNode(ISD::TRUNCATE, DL, VT, Op); | |||
| 33157 | } | |||
| 33158 | } | |||
| 33159 | ||||
| 33160 | // Decompose 256-bit ops into smaller 128-bit ops. | |||
| 33161 | if (VT.is256BitVector() && !Subtarget.hasInt256()) | |||
| 33162 | return splitVectorIntUnary(Op, DAG); | |||
| 33163 | ||||
| 33164 | // Decompose 512-bit ops into smaller 256-bit ops. | |||
| 33165 | if (VT.is512BitVector() && !Subtarget.hasBWI()) | |||
| 33166 | return splitVectorIntUnary(Op, DAG); | |||
| 33167 | ||||
| 33168 | // For element types greater than i8, do vXi8 pop counts and a bytesum. | |||
| 33169 | if (VT.getScalarType() != MVT::i8) { | |||
| 33170 | MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8); | |||
| 33171 | SDValue ByteOp = DAG.getBitcast(ByteVT, Op0); | |||
| 33172 | SDValue PopCnt8 = DAG.getNode(ISD::CTPOP, DL, ByteVT, ByteOp); | |||
| 33173 | return LowerHorizontalByteSum(PopCnt8, VT, Subtarget, DAG); | |||
| 33174 | } | |||
| 33175 | ||||
| 33176 | // We can't use the fast LUT approach, so fall back on LegalizeDAG. | |||
| 33177 | if (!Subtarget.hasSSSE3()) | |||
| 33178 | return SDValue(); | |||
| 33179 | ||||
| 33180 | return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG); | |||
| 33181 | } | |||
| 33182 | ||||
| 33183 | static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget, | |||
| 33184 | SelectionDAG &DAG) { | |||
| 33185 | assert(Op.getSimpleValueType().isVector() && | |||
| 33186 | "We only do custom lowering for vector population count."); | |||
| 33187 | return LowerVectorCTPOP(Op, Subtarget, DAG); | |||
| 33188 | } | |||
| 33189 | ||||
| 33190 | static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) { | |||
| 33191 | MVT VT = Op.getSimpleValueType(); | |||
| 33192 | SDValue In = Op.getOperand(0); | |||
| 33193 | SDLoc DL(Op); | |||
| 33194 | ||||
| 33195 | // For scalars, it's still beneficial to transfer to/from the SIMD unit to | |||
| 33196 | // perform the BITREVERSE. | |||
| 33197 | if (!VT.isVector()) { | |||
| 33198 | MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits()); | |||
| 33199 | SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In); | |||
| 33200 | Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res); | |||
| 33201 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res, | |||
| 33202 | DAG.getIntPtrConstant(0, DL)); | |||
| 33203 | } | |||
| 33204 | ||||
| 33205 | int NumElts = VT.getVectorNumElements(); | |||
| 33206 | int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8; | |||
| 33207 | ||||
| 33208 | // Decompose 256-bit ops into smaller 128-bit ops. | |||
| 33209 | if (VT.is256BitVector()) | |||
| 33210 | return splitVectorIntUnary(Op, DAG); | |||
| 33211 | ||||
| 33212 | assert(VT.is128BitVector() && | |||
| 33213 | "Only 128-bit vector bitreverse lowering supported."); | |||
| 33214 | ||||
| 33215 | // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we | |||
| 33216 | // perform the BSWAP in the shuffle. | |||
| 33217 | // It's best to shuffle using the second operand, as this will implicitly | |||
| 33218 | // allow memory folding for multiple vectors. | |||
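| | // Editor's worked example: for v4i32 (ScalarSizeInBytes == 4), the first | |||
| | // iteration is i == 0, j == 3, giving SourceByte = 16 + 3 = 0x13 and | |||
| | // PermuteByte = 0x13 | 0x40 = 0x53, i.e. "take byte 3 of the second source | |||
| | // and reverse its bits"; iterating j downward byte-swaps each lane, so one | |||
| | // VPPERM performs the BSWAP and the per-byte bit reversal together. | |||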
| 33219 | SmallVector<SDValue, 16> MaskElts; | |||
| 33220 | for (int i = 0; i != NumElts; ++i) { | |||
| 33221 | for (int j = ScalarSizeInBytes - 1; j >= 0; --j) { | |||
| 33222 | int SourceByte = 16 + (i * ScalarSizeInBytes) + j; | |||
| 33223 | int PermuteByte = SourceByte | (2 << 5); | |||
| 33224 | MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8)); | |||
| 33225 | } | |||
| 33226 | } | |||
| 33227 | ||||
| 33228 | SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts); | |||
| 33229 | SDValue Res = DAG.getBitcast(MVT::v16i8, In); | |||
| 33230 | Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8), | |||
| 33231 | Res, Mask); | |||
| 33232 | return DAG.getBitcast(VT, Res); | |||
| 33233 | } | |||
| 33234 | ||||
| 33235 | static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, | |||
| 33236 | SelectionDAG &DAG) { | |||
| 33237 | MVT VT = Op.getSimpleValueType(); | |||
| 33238 | ||||
| 33239 | if (Subtarget.hasXOP() && !VT.is512BitVector()) | |||
| 33240 | return LowerBITREVERSE_XOP(Op, DAG); | |||
| 33241 | ||||
| 33242 | assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE"); | |||
| 33243 | ||||
| 33244 | SDValue In = Op.getOperand(0); | |||
| 33245 | SDLoc DL(Op); | |||
| 33246 | ||||
| 33247 | assert(VT.getScalarType() == MVT::i8 && | |||
| 33248 | "Only byte vector BITREVERSE supported"); | |||
| 33249 | ||||
| 33250 | // Split v64i8 without BWI so that we can still use the PSHUFB lowering. | |||
| 33251 | if (VT == MVT::v64i8 && !Subtarget.hasBWI()) | |||
| 33252 | return splitVectorIntUnary(Op, DAG); | |||
| 33253 | ||||
| 33254 | // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2. | |||
| 33255 | if (VT == MVT::v32i8 && !Subtarget.hasInt256()) | |||
| 33256 | return splitVectorIntUnary(Op, DAG); | |||
| 33257 | ||||
| 33258 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 33259 | ||||
| 33260 | // If we have GFNI, we can use GF2P8AFFINEQB to reverse the bits. | |||
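| | // Editor's note: GF2P8AFFINEQB computes, per byte, the GF(2) affine | |||
| | // transform A*x ^ b. With b == 0 and A given by 0x8040201008040201 (the | |||
| | // bit-reversed identity matrix, one byte per row), the transform reverses | |||
| | // the bit order within every byte. | |||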
| 33261 | if (Subtarget.hasGFNI()) { | |||
| 33262 | MVT MatrixVT = MVT::getVectorVT(MVT::i64, NumElts / 8); | |||
| 33263 | SDValue Matrix = DAG.getConstant(0x8040201008040201ULL, DL, MatrixVT); | |||
| 33264 | Matrix = DAG.getBitcast(VT, Matrix); | |||
| 33265 | return DAG.getNode(X86ISD::GF2P8AFFINEQB, DL, VT, In, Matrix, | |||
| 33266 | DAG.getTargetConstant(0, DL, MVT::i8)); | |||
| 33267 | } | |||
| 33268 | ||||
| 33269 | // Perform BITREVERSE using PSHUFB lookups. Each byte is split into | |||
| 33270 | // two nibbles, and a PSHUFB lookup finds the bitreverse of each | |||
| 33271 | // 0-15 value (moved to the other nibble). | |||
| 33272 | SDValue NibbleMask = DAG.getConstant(0xF, DL, VT); | |||
| 33273 | SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask); | |||
| 33274 | SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT)); | |||
| 33275 | ||||
| 33276 | const int LoLUT[16] = { | |||
| 33277 | /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0, | |||
| 33278 | /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0, | |||
| 33279 | /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0, | |||
| 33280 | /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0}; | |||
| 33281 | const int HiLUT[16] = { | |||
| 33282 | /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C, | |||
| 33283 | /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E, | |||
| 33284 | /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D, | |||
| 33285 | /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F}; | |||
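| | // Editor's worked example: for an input byte 0xB4 (0b10110100), the low | |||
| | // nibble 0x4 looks up LoLUT[4] = 0x20 and the high nibble 0xB looks up | |||
| | // HiLUT[11] = 0x0D; OR-ing gives 0x2D (0b00101101), the bit-reversed byte. | |||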
| 33286 | ||||
| 33287 | SmallVector<SDValue, 16> LoMaskElts, HiMaskElts; | |||
| 33288 | for (unsigned i = 0; i < NumElts; ++i) { | |||
| 33289 | LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8)); | |||
| 33290 | HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8)); | |||
| 33291 | } | |||
| 33292 | ||||
| 33293 | SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts); | |||
| 33294 | SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts); | |||
| 33295 | Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo); | |||
| 33296 | Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi); | |||
| 33297 | return DAG.getNode(ISD::OR, DL, VT, Lo, Hi); | |||
| 33298 | } | |||
| 33299 | ||||
| 33300 | static SDValue LowerPARITY(SDValue Op, const X86Subtarget &Subtarget, | |||
| 33301 | SelectionDAG &DAG) { | |||
| 33302 | SDLoc DL(Op); | |||
| 33303 | SDValue X = Op.getOperand(0); | |||
| 33304 | MVT VT = Op.getSimpleValueType(); | |||
| 33305 | ||||
| 33306 | // Special case: if the input fits in 8 bits, we can use a single 8-bit TEST. | |||
| 33307 | if (VT == MVT::i8 || | |||
| 33308 | DAG.MaskedValueIsZero(X, APInt::getBitsSetFrom(VT.getSizeInBits(), 8))) { | |||
| 33309 | X = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X); | |||
| 33310 | SDValue Flags = DAG.getNode(X86ISD::CMP, DL, MVT::i32, X, | |||
| 33311 | DAG.getConstant(0, DL, MVT::i8)); | |||
| 33312 | // Copy the inverse of the parity flag into a register with setcc. | |||
| 33313 | SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); | |||
| 33314 | // Extend to the original type. | |||
| 33315 | return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp); | |||
| 33316 | } | |||
| 33317 | ||||
| 33318 | // If we have POPCNT, use the default expansion. | |||
| 33319 | if (Subtarget.hasPOPCNT()) | |||
| 33320 | return SDValue(); | |||
| 33321 | ||||
| 33322 | if (VT == MVT::i64) { | |||
| 33323 | // Xor the high and low 32-bits together using a 32-bit operation. | |||
| 33324 | SDValue Hi = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, | |||
| 33325 | DAG.getNode(ISD::SRL, DL, MVT::i64, X, | |||
| 33326 | DAG.getConstant(32, DL, MVT::i8))); | |||
| 33327 | SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, X); | |||
| 33328 | X = DAG.getNode(ISD::XOR, DL, MVT::i32, Lo, Hi); | |||
| 33329 | } | |||
| 33330 | ||||
| 33331 | if (VT != MVT::i16) { | |||
| 33332 | // Xor the high and low 16-bits together using a 32-bit operation. | |||
| 33333 | SDValue Hi16 = DAG.getNode(ISD::SRL, DL, MVT::i32, X, | |||
| 33334 | DAG.getConstant(16, DL, MVT::i8)); | |||
| 33335 | X = DAG.getNode(ISD::XOR, DL, MVT::i32, X, Hi16); | |||
| 33336 | } else { | |||
| 33337 | // If the input is 16-bits, we need to extend to use an i32 shift below. | |||
| 33338 | X = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, X); | |||
| 33339 | } | |||
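| | // Editor's sketch of the full reduction as an assumed-equivalent scalar | |||
| | // model (xor-folding halves preserves parity at every step): | |||
| | //   uint32_t V = X32;                     // X after any 64->32 fold | |||
| | //   V ^= V >> 16;                         // parity now lives in the low 16 | |||
| | //   uint8_t B = (uint8_t)(V ^ (V >> 8));  // ... and then in the low byte | |||
| | //   bool Parity = __builtin_parity(B);    // == parity of the original input | |||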
| 33340 | ||||
| 33341 | // Finally xor the low 2 bytes together and use an 8-bit flag-setting xor. | |||
| 33342 | // This should allow an h-reg to be used to save a shift. | |||
| 33343 | SDValue Hi = DAG.getNode( | |||
| 33344 | ISD::TRUNCATE, DL, MVT::i8, | |||
| 33345 | DAG.getNode(ISD::SRL, DL, MVT::i32, X, DAG.getConstant(8, DL, MVT::i8))); | |||
| 33346 | SDValue Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, X); | |||
| 33347 | SDVTList VTs = DAG.getVTList(MVT::i8, MVT::i32); | |||
| 33348 | SDValue Flags = DAG.getNode(X86ISD::XOR, DL, VTs, Lo, Hi).getValue(1); | |||
| 33349 | ||||
| 33350 | // Copy the inverse of the parity flag into a register with setcc. | |||
| 33351 | SDValue Setnp = getSETCC(X86::COND_NP, Flags, DL, DAG); | |||
| 33352 | // Extend to the original type. | |||
| 33353 | return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Setnp); | |||
| 33354 | } | |||
| 33355 | ||||
| 33356 | static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, | |||
| 33357 | const X86Subtarget &Subtarget) { | |||
| 33358 | unsigned NewOpc = 0; | |||
| 33359 | switch (N->getOpcode()) { | |||
| 33360 | case ISD::ATOMIC_LOAD_ADD: | |||
| 33361 | NewOpc = X86ISD::LADD; | |||
| 33362 | break; | |||
| 33363 | case ISD::ATOMIC_LOAD_SUB: | |||
| 33364 | NewOpc = X86ISD::LSUB; | |||
| 33365 | break; | |||
| 33366 | case ISD::ATOMIC_LOAD_OR: | |||
| 33367 | NewOpc = X86ISD::LOR; | |||
| 33368 | break; | |||
| 33369 | case ISD::ATOMIC_LOAD_XOR: | |||
| 33370 | NewOpc = X86ISD::LXOR; | |||
| 33371 | break; | |||
| 33372 | case ISD::ATOMIC_LOAD_AND: | |||
| 33373 | NewOpc = X86ISD::LAND; | |||
| 33374 | break; | |||
| 33375 | default: | |||
| 33376 | llvm_unreachable("Unknown ATOMIC_LOAD_ opcode"); | |||
| 33377 | } | |||
| 33378 | ||||
| 33379 | MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand(); | |||
| 33380 | ||||
| 33381 | return DAG.getMemIntrinsicNode( | |||
| 33382 | NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other), | |||
| 33383 | {N->getOperand(0), N->getOperand(1), N->getOperand(2)}, | |||
| 33384 | /*MemVT=*/N->getSimpleValueType(0), MMO); | |||
| 33385 | } | |||
| 33386 | ||||
| 33387 | /// Lower atomic_load_ops into LOCK-prefixed operations. | |||
| 33388 | static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, | |||
| 33389 | const X86Subtarget &Subtarget) { | |||
| 33390 | AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode()); | |||
| 33391 | SDValue Chain = N->getOperand(0); | |||
| 33392 | SDValue LHS = N->getOperand(1); | |||
| 33393 | SDValue RHS = N->getOperand(2); | |||
| 33394 | unsigned Opc = N->getOpcode(); | |||
| 33395 | MVT VT = N->getSimpleValueType(0); | |||
| 33396 | SDLoc DL(N); | |||
| 33397 | ||||
| 33398 | // We can lower atomic_load_add into LXADD. However, any other atomicrmw op | |||
| 33399 | // can only be lowered when the result is unused. They should have already | |||
| 33400 | // been transformed into a cmpxchg loop in AtomicExpand. | |||
| 33401 | if (N->hasAnyUseOfValue(0)) { | |||
| 33402 | // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to | |||
| 33403 | // select LXADD if LOCK_SUB can't be selected. | |||
| 33404 | // Handle (atomic_load_xor p, SignBit) as (atomic_load_add p, SignBit) so we | |||
| 33405 | // can use LXADD as opposed to cmpxchg. | |||
| 33406 | if (Opc == ISD::ATOMIC_LOAD_SUB || | |||
| 33407 | (Opc == ISD::ATOMIC_LOAD_XOR && isMinSignedConstant(RHS))) { | |||
| 33408 | RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS); | |||
| 33409 | return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS, RHS, | |||
| 33410 | AN->getMemOperand()); | |||
| 33411 | } | |||
| 33412 | assert(Opc == ISD::ATOMIC_LOAD_ADD && | |||
| 33413 | "Used AtomicRMW ops other than Add should have been expanded!"); | |||
| 33414 | return N; | |||
| 33415 | } | |||
| 33416 | ||||
| 33417 | // Specialized lowering for the canonical form of an idempotent atomicrmw. | |||
| 33418 | // The core idea here is that since the memory location isn't actually | |||
| 33419 | // changing, all we need is a lowering for the *ordering* impacts of the | |||
| 33420 | // atomicrmw. As such, we can choose a different operation and memory | |||
| 33421 | // location to minimize impact on other code. | |||
| 33422 | if (Opc == ISD::ATOMIC_LOAD_OR && isNullConstant(RHS)) { | |||
| 33423 | // On X86, the only ordering which actually requires an instruction is | |||
| 33424 | // seq_cst which isn't SingleThread; everything else just needs to be | |||
| 33425 | // preserved during codegen and then dropped. Note that we expect (but don't | |||
| 33426 | // assume) that orderings other than seq_cst and acq_rel have been | |||
| 33427 | // canonicalized to a store or load. | |||
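| | // Editor's note: e.g. "atomicrmw or ptr %p, i32 0 seq_cst" only needs the | |||
| | // fencing effect, so it lowers to a locked read-modify-write of a harmless | |||
| | // stack slot (assumed to look roughly like "lock orl $0, (%rsp)") instead | |||
| | // of the more expensive MFENCE. | |||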
| 33428 | if (AN->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent && | |||
| 33429 | AN->getSyncScopeID() == SyncScope::System) { | |||
| 33430 | // Prefer a locked operation against a stack location to minimize cache | |||
| 33431 | // traffic. This assumes that stack locations are very likely to be | |||
| 33432 | // accessed only by the owning thread. | |||
| 33433 | SDValue NewChain = emitLockedStackOp(DAG, Subtarget, Chain, DL); | |||
| 33434 | assert(!N->hasAnyUseOfValue(0)); | |||
| 33435 | // NOTE: The getUNDEF is needed to give something for the unused result 0. | |||
| 33436 | return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), | |||
| 33437 | DAG.getUNDEF(VT), NewChain); | |||
| 33438 | } | |||
| 33439 | // MEMBARRIER is a compiler barrier; it codegens to a no-op. | |||
| 33440 | SDValue NewChain = DAG.getNode(ISD::MEMBARRIER, DL, MVT::Other, Chain); | |||
| 33441 | assert(!N->hasAnyUseOfValue(0)); | |||
| 33442 | // NOTE: The getUNDEF is needed to give something for the unused result 0. | |||
| 33443 | return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), | |||
| 33444 | DAG.getUNDEF(VT), NewChain); | |||
| 33445 | } | |||
| 33446 | ||||
| 33447 | SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget); | |||
| 33448 | // RAUW the chain, but don't worry about the result, as it's unused. | |||
| 33449 | assert(!N->hasAnyUseOfValue(0)); | |||
| 33450 | // NOTE: The getUNDEF is needed to give something for the unused result 0. | |||
| 33451 | return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), | |||
| 33452 | DAG.getUNDEF(VT), LockOp.getValue(1)); | |||
| 33453 | } | |||
| 33454 | ||||
| 33455 | static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG, | |||
| 33456 | const X86Subtarget &Subtarget) { | |||
| 33457 | auto *Node = cast<AtomicSDNode>(Op.getNode()); | |||
| 33458 | SDLoc dl(Node); | |||
| 33459 | EVT VT = Node->getMemoryVT(); | |||
| 33460 | ||||
| 33461 | bool IsSeqCst = | |||
| 33462 | Node->getSuccessOrdering() == AtomicOrdering::SequentiallyConsistent; | |||
| 33463 | bool IsTypeLegal = DAG.getTargetLoweringInfo().isTypeLegal(VT); | |||
| 33464 | ||||
| 33465 | // If this store is not sequentially consistent and the type is legal | |||
| 33466 | // we can just keep it. | |||
| 33467 | if (!IsSeqCst && IsTypeLegal) | |||
| 33468 | return Op; | |||
| 33469 | ||||
| 33470 | if (VT == MVT::i64 && !IsTypeLegal) { | |||
| 33471 | // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE | |||
| 33472 | // is enabled. | |||
| 33473 | bool NoImplicitFloatOps = | |||
| 33474 | DAG.getMachineFunction().getFunction().hasFnAttribute( | |||
| 33475 | Attribute::NoImplicitFloat); | |||
| 33476 | if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) { | |||
| 33477 | SDValue Chain; | |||
| 33478 | if (Subtarget.hasSSE1()) { | |||
| 33479 | SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, | |||
| 33480 | Node->getOperand(2)); | |||
| 33481 | MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32; | |||
| 33482 | SclToVec = DAG.getBitcast(StVT, SclToVec); | |||
| 33483 | SDVTList Tys = DAG.getVTList(MVT::Other); | |||
| 33484 | SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()}; | |||
| 33485 | Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops, | |||
| 33486 | MVT::i64, Node->getMemOperand()); | |||
| 33487 | } else if (Subtarget.hasX87()) { | |||
| 33488 | // First load this into an 80-bit X87 register using a stack temporary. | |||
| 33489 | // This will put the whole integer into the significand. | |||
| 33490 | SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64); | |||
| 33491 | int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); | |||
| 33492 | MachinePointerInfo MPI = | |||
| 33493 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); | |||
| 33494 | Chain = | |||
| 33495 | DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr, | |||
| 33496 | MPI, MaybeAlign(), MachineMemOperand::MOStore); | |||
| 33497 | SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); | |||
| 33498 | SDValue LdOps[] = {Chain, StackPtr}; | |||
| 33499 | SDValue Value = DAG.getMemIntrinsicNode( | |||
| 33500 | X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI, | |||
| 33501 | /*Align*/ std::nullopt, MachineMemOperand::MOLoad); | |||
| 33502 | Chain = Value.getValue(1); | |||
| 33503 | ||||
| 33504 | // Now use an FIST to do the atomic store. | |||
| 33505 | SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()}; | |||
| 33506 | Chain = | |||
| 33507 | DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other), | |||
| 33508 | StoreOps, MVT::i64, Node->getMemOperand()); | |||
| 33509 | } | |||
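| | // Editor's note: this relies on aligned 64-bit x87 loads/stores being | |||
| | // performed as single quadword accesses; an assumed resulting sequence is | |||
| | // roughly "fildll (tmp); fistpll (dst)", giving an atomic i64 store | |||
| | // without SSE. | |||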
| 33510 | ||||
| 33511 | if (Chain) { | |||
| 33512 | // If this is a sequentially consistent store, also emit an appropriate | |||
| 33513 | // barrier. | |||
| 33514 | if (IsSeqCst) | |||
| 33515 | Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl); | |||
| 33516 | ||||
| 33517 | return Chain; | |||
| 33518 | } | |||
| 33519 | } | |||
| 33520 | } | |||
| 33521 | ||||
| 33522 | // Convert seq_cst store -> xchg | |||
| 33523 | // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b) | |||
| 33524 | // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment. | |||
| 33525 | SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl, | |||
| 33526 | Node->getMemoryVT(), | |||
| 33527 | Node->getOperand(0), | |||
| 33528 | Node->getOperand(1), Node->getOperand(2), | |||
| 33529 | Node->getMemOperand()); | |||
| 33530 | return Swap.getValue(1); | |||
| 33531 | } | |||
| 33532 | ||||
| 33533 | static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) { | |||
| 33534 | SDNode *N = Op.getNode(); | |||
| 33535 | MVT VT = N->getSimpleValueType(0); | |||
| 33536 | unsigned Opc = Op.getOpcode(); | |||
| 33537 | ||||
| 33538 | // Let legalize expand this if it isn't a legal type yet. | |||
| 33539 | if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) | |||
| 33540 | return SDValue(); | |||
| 33541 | ||||
| 33542 | SDVTList VTs = DAG.getVTList(VT, MVT::i32); | |||
| 33543 | SDLoc DL(N); | |||
| 33544 | ||||
| 33545 | // Set the carry flag. | |||
| 33546 | SDValue Carry = Op.getOperand(2); | |||
| 33547 | EVT CarryVT = Carry.getValueType(); | |||
| 33548 | Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32), | |||
| 33549 | Carry, DAG.getAllOnesConstant(DL, CarryVT)); | |||
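| | // Adding all-ones (-1) produces a carry-out exactly when the boolean carry | |||
| | // operand is nonzero, re-materializing it in EFLAGS.CF for the ADC/SBB | |||
| | // below. E.g. for an i8 carry: 1 + 0xFF wraps to 0x00 with CF = 1, while | |||
| | // 0 + 0xFF stays 0xFF with CF = 0. | |||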
| 33550 | ||||
| 33551 | bool IsAdd = Opc == ISD::UADDO_CARRY || Opc == ISD::SADDO_CARRY; | |||
| 33552 | SDValue Sum = DAG.getNode(IsAdd ? X86ISD::ADC : X86ISD::SBB, DL, VTs, | |||
| 33553 | Op.getOperand(0), Op.getOperand(1), | |||
| 33554 | Carry.getValue(1)); | |||
| 33555 | ||||
| 33556 | bool IsSigned = Opc == ISD::SADDO_CARRY || Opc == ISD::SSUBO_CARRY; | |||
| 33557 | SDValue SetCC = getSETCC(IsSigned ? X86::COND_O : X86::COND_B, | |||
| 33558 | Sum.getValue(1), DL, DAG); | |||
| 33559 | if (N->getValueType(1) == MVT::i1) | |||
| 33560 | SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); | |||
| 33561 | ||||
| 33562 | return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC); | |||
| 33563 | } | |||
| 33564 | ||||
| 33565 | static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, | |||
| 33566 | SelectionDAG &DAG) { | |||
| 33567 | assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit()); | |||
| 33568 | ||||
| 33569 | // For MacOSX, we want to call an alternative entry point: __sincos_stret, | |||
| 33570 | // which returns the values as { float, float } (in XMM0) or | |||
| 33571 | // { double, double } (which is returned in XMM0, XMM1). | |||
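| | // Editor's sketch of the assumed C-level shape of the f64 libcall (the | |||
| | // struct name is illustrative only; the symbol is real): | |||
| | //   struct SinCosRet { double Sin, Cos; };      // returned in XMM0, XMM1 | |||
| | //   struct SinCosRet __sincos_stret(double X);  // computes both in one call | |||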
| 33572 | SDLoc dl(Op); | |||
| 33573 | SDValue Arg = Op.getOperand(0); | |||
| 33574 | EVT ArgVT = Arg.getValueType(); | |||
| 33575 | Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext()); | |||
| 33576 | ||||
| 33577 | TargetLowering::ArgListTy Args; | |||
| 33578 | TargetLowering::ArgListEntry Entry; | |||
| 33579 | ||||
| 33580 | Entry.Node = Arg; | |||
| 33581 | Entry.Ty = ArgTy; | |||
| 33582 | Entry.IsSExt = false; | |||
| 33583 | Entry.IsZExt = false; | |||
| 33584 | Args.push_back(Entry); | |||
| 33585 | ||||
| 33586 | bool isF64 = ArgVT == MVT::f64; | |||
| 33587 | // Only optimize x86_64 for now. i386 is a bit messy. For f32, | |||
| 33588 | // the small struct {f32, f32} is returned in (eax, edx). For f64, | |||
| 33589 | // the results are returned via SRet in memory. | |||
| 33590 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 33591 | RTLIB::Libcall LC = isF64 ? RTLIB::SINCOS_STRET_F64 : RTLIB::SINCOS_STRET_F32; | |||
| 33592 | const char *LibcallName = TLI.getLibcallName(LC); | |||
| 33593 | SDValue Callee = | |||
| 33594 | DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); | |||
| 33595 | ||||
| 33596 | Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) | |||
| 33597 | : (Type *)FixedVectorType::get(ArgTy, 4); | |||
| 33598 | ||||
| 33599 | TargetLowering::CallLoweringInfo CLI(DAG); | |||
| 33600 | CLI.setDebugLoc(dl) | |||
| 33601 | .setChain(DAG.getEntryNode()) | |||
| 33602 | .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args)); | |||
| 33603 | ||||
| 33604 | std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI); | |||
| 33605 | ||||
| 33606 | if (isF64) | |||
| 33607 | // Returned in xmm0 and xmm1. | |||
| 33608 | return CallResult.first; | |||
| 33609 | ||||
| 33610 | // Returned in bits 0:31 and 32:63 of xmm0. | |||
| 33611 | SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, | |||
| 33612 | CallResult.first, DAG.getIntPtrConstant(0, dl)); | |||
| 33613 | SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT, | |||
| 33614 | CallResult.first, DAG.getIntPtrConstant(1, dl)); | |||
| 33615 | SDVTList Tys = DAG.getVTList(ArgVT, ArgVT); | |||
| 33616 | return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal); | |||
| 33617 | } | |||
| 33618 | ||||
| 33619 | /// Widen a vector input to a vector of NVT. The | |||
| 33620 | /// input vector must have the same element type as NVT. | |||
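| | /// For example, widening a v2i64 value to v8i64 with FillWithZeroes set | |||
| | /// inserts the input at element 0 of a zero vector; otherwise the widened | |||
| | /// elements are undef. | |||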
| 33621 | static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG, | |||
| 33622 | bool FillWithZeroes = false) { | |||
| 33623 | // Check if InOp already has the right width. | |||
| 33624 | MVT InVT = InOp.getSimpleValueType(); | |||
| 33625 | if (InVT == NVT) | |||
| 33626 | return InOp; | |||
| 33627 | ||||
| 33628 | if (InOp.isUndef()) | |||
| 33629 | return DAG.getUNDEF(NVT); | |||
| 33630 | ||||
| 33631 | assert(InVT.getVectorElementType() == NVT.getVectorElementType() && | |||
| 33632 | "input and widen element type must match"); | |||
| 33633 | ||||
| 33634 | unsigned InNumElts = InVT.getVectorNumElements(); | |||
| 33635 | unsigned WidenNumElts = NVT.getVectorNumElements(); | |||
| 33636 | assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 && | |||
| 33637 | "Unexpected request for vector widening"); | |||
| 33638 | ||||
| 33639 | SDLoc dl(InOp); | |||
| 33640 | if (InOp.getOpcode() == ISD::CONCAT_VECTORS && | |||
| 33641 | InOp.getNumOperands() == 2) { | |||
| 33642 | SDValue N1 = InOp.getOperand(1); | |||
| 33643 | if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) || | |||
| 33644 | N1.isUndef()) { | |||
| 33645 | InOp = InOp.getOperand(0); | |||
| 33646 | InVT = InOp.getSimpleValueType(); | |||
| 33647 | InNumElts = InVT.getVectorNumElements(); | |||
| 33648 | } | |||
| 33649 | } | |||
| 33650 | if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) || | |||
| 33651 | ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) { | |||
| 33652 | SmallVector<SDValue, 16> Ops; | |||
| 33653 | for (unsigned i = 0; i < InNumElts; ++i) | |||
| 33654 | Ops.push_back(InOp.getOperand(i)); | |||
| 33655 | ||||
| 33656 | EVT EltVT = InOp.getOperand(0).getValueType(); | |||
| 33657 | ||||
| 33658 | SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) : | |||
| 33659 | DAG.getUNDEF(EltVT); | |||
| 33660 | for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i) | |||
| 33661 | Ops.push_back(FillVal); | |||
| 33662 | return DAG.getBuildVector(NVT, dl, Ops); | |||
| 33663 | } | |||
| 33664 | SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) : | |||
| 33665 | DAG.getUNDEF(NVT); | |||
| 33666 | return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal, | |||
| 33667 | InOp, DAG.getIntPtrConstant(0, dl)); | |||
| 33668 | } | |||
| 33669 | ||||
| 33670 | static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, | |||
| 33671 | SelectionDAG &DAG) { | |||
| 33672 | assert(Subtarget.hasAVX512() && | |||
| 33673 | "MGATHER/MSCATTER are supported on AVX-512 arch only"); | |||
| 33674 | ||||
| 33675 | MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode()); | |||
| 33676 | SDValue Src = N->getValue(); | |||
| 33677 | MVT VT = Src.getSimpleValueType(); | |||
| 33678 | assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); | |||
| 33679 | SDLoc dl(Op); | |||
| 33680 | ||||
| 33681 | SDValue Scale = N->getScale(); | |||
| 33682 | SDValue Index = N->getIndex(); | |||
| 33683 | SDValue Mask = N->getMask(); | |||
| 33684 | SDValue Chain = N->getChain(); | |||
| 33685 | SDValue BasePtr = N->getBasePtr(); | |||
| 33686 | ||||
| 33687 | if (VT == MVT::v2f32 || VT == MVT::v2i32) { | |||
| 33688 | assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); | |||
| 33689 | // If the index is v2i64 and we have VLX we can use xmm for data and index. | |||
| 33690 | if (Index.getValueType() == MVT::v2i64 && Subtarget.hasVLX()) { | |||
| 33691 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 33692 | EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); | |||
| 33693 | Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT)); | |||
| 33694 | SDVTList VTs = DAG.getVTList(MVT::Other); | |||
| 33695 | SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; | |||
| 33696 | return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops, | |||
| 33697 | N->getMemoryVT(), N->getMemOperand()); | |||
| 33698 | } | |||
| 33699 | return SDValue(); | |||
| 33700 | } | |||
| 33701 | ||||
| 33702 | MVT IndexVT = Index.getSimpleValueType(); | |||
| 33703 | ||||
| 33704 | // If the index is v2i32, we're being called by type legalization and we | |||
| 33705 | // should just let the default handling take care of it. | |||
| 33706 | if (IndexVT == MVT::v2i32) | |||
| 33707 | return SDValue(); | |||
| 33708 | ||||
| 33709 | // If we don't have VLX and neither the passthru nor the index is 512 bits, | |||
| 33710 | // we need to widen until one is. | |||
| 33711 | if (!Subtarget.hasVLX() && !VT.is512BitVector() && | |||
| 33712 | !Index.getSimpleValueType().is512BitVector()) { | |||
| 33713 | // Determine how much we need to widen by to get a 512-bit type. | |||
| 33714 | unsigned Factor = std::min(512/VT.getSizeInBits(), | |||
| 33715 | 512/IndexVT.getSizeInBits()); | |||
| 33716 | unsigned NumElts = VT.getVectorNumElements() * Factor; | |||
| 33717 | ||||
| 33718 | VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); | |||
| 33719 | IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts); | |||
| 33720 | MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); | |||
| 33721 | ||||
| 33722 | Src = ExtendToType(Src, VT, DAG); | |||
| 33723 | Index = ExtendToType(Index, IndexVT, DAG); | |||
| 33724 | Mask = ExtendToType(Mask, MaskVT, DAG, true); | |||
| 33725 | } | |||
| 33726 | ||||
| 33727 | SDVTList VTs = DAG.getVTList(MVT::Other); | |||
| 33728 | SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale}; | |||
| 33729 | return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops, | |||
| 33730 | N->getMemoryVT(), N->getMemOperand()); | |||
| 33731 | } | |||
| 33732 | ||||
| 33733 | static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget, | |||
| 33734 | SelectionDAG &DAG) { | |||
| 33735 | ||||
| 33736 | MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode()); | |||
| 33737 | MVT VT = Op.getSimpleValueType(); | |||
| 33738 | MVT ScalarVT = VT.getScalarType(); | |||
| 33739 | SDValue Mask = N->getMask(); | |||
| 33740 | MVT MaskVT = Mask.getSimpleValueType(); | |||
| 33741 | SDValue PassThru = N->getPassThru(); | |||
| 33742 | SDLoc dl(Op); | |||
| 33743 | ||||
| 33744 | // Handle AVX masked loads which don't support passthru other than 0. | |||
| 33745 | if (MaskVT.getVectorElementType() != MVT::i1) { | |||
| 33746 | // We also allow undef in the isel pattern. | |||
| 33747 | if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode())) | |||
| 33748 | return Op; | |||
| 33749 | ||||
| 33750 | SDValue NewLoad = DAG.getMaskedLoad( | |||
| 33751 | VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, | |||
| 33752 | getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(), | |||
| 33753 | N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(), | |||
| 33754 | N->isExpandingLoad()); | |||
| 33755 | // Emit a blend. | |||
| 33756 | SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru); | |||
| 33757 | return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl); | |||
| 33758 | } | |||
| 33759 | ||||
| 33760 | assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) && | |||
| 33761 | "Expanding masked load is supported on AVX-512 target only!"); | |||
| 33762 | ||||
| 33763 | assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) && | |||
| 33764 | "Expanding masked load is supported for 32 and 64-bit types only!"); | |||
| 33765 | ||||
| 33766 | assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && | |||
| 33767 | "Cannot lower masked load op."); | |||
| 33768 | ||||
| 33769 | assert((ScalarVT.getSizeInBits() >= 32 || | |||
| 33770 | (Subtarget.hasBWI() && | |||
| 33771 | (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) && | |||
| 33772 | "Unsupported masked load op."); | |||
| 33773 | ||||
| 33774 | // This operation is legal for targets with VLX, but without | |||
| 33775 | // VLX the vector should be widened to 512 bits. | |||
| 33776 | unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits(); | |||
| 33777 | MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); | |||
| 33778 | PassThru = ExtendToType(PassThru, WideDataVT, DAG); | |||
| 33779 | ||||
| 33780 | // Mask element has to be i1. | |||
| 33781 | assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 && | |||
| 33782 | "Unexpected mask type"); | |||
| 33783 | ||||
| 33784 | MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); | |||
| 33785 | ||||
| 33786 | Mask = ExtendToType(Mask, WideMaskVT, DAG, true); | |||
| 33787 | SDValue NewLoad = DAG.getMaskedLoad( | |||
| 33788 | WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, | |||
| 33789 | PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(), | |||
| 33790 | N->getExtensionType(), N->isExpandingLoad()); | |||
| 33791 | ||||
| 33792 | SDValue Extract = | |||
| 33793 | DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0), | |||
| 33794 | DAG.getIntPtrConstant(0, dl)); | |||
| 33795 | SDValue RetOps[] = {Extract, NewLoad.getValue(1)}; | |||
| 33796 | return DAG.getMergeValues(RetOps, dl); | |||
| 33797 | } | |||
| 33798 | ||||
| 33799 | static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, | |||
| 33800 | SelectionDAG &DAG) { | |||
| 33801 | MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode()); | |||
| 33802 | SDValue DataToStore = N->getValue(); | |||
| 33803 | MVT VT = DataToStore.getSimpleValueType(); | |||
| 33804 | MVT ScalarVT = VT.getScalarType(); | |||
| 33805 | SDValue Mask = N->getMask(); | |||
| 33806 | SDLoc dl(Op); | |||
| 33807 | ||||
| 33808 | assert((!N->isCompressingStore() || Subtarget.hasAVX512()) && | |||
| 33809 | "Compressing masked store is supported on AVX-512 target only!"); | |||
| 33810 | ||||
| 33811 | assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) && | |||
| 33812 | "Compressing masked store is supported for 32 and 64-bit types only!"); | |||
| 33813 | ||||
| 33814 | assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && | |||
| 33815 | "Cannot lower masked store op."); | |||
| 33816 | ||||
| 33817 | assert((ScalarVT.getSizeInBits() >= 32 || | |||
| 33818 | (Subtarget.hasBWI() && | |||
| 33819 | (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) && | |||
| 33820 | "Unsupported masked store op."); | |||
| 33821 | ||||
| 33822 | // This operation is legal for targets with VLX, but without | |||
| 33823 | // VLX the vector should be widened to 512 bits. | |||
| 33824 | unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits(); | |||
| 33825 | MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec); | |||
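| | // e.g. a v8i32 store without VLX: NumEltsInWideVec = 512 / 32 = 16, so the | |||
| | // data is widened to v16i32 (and the mask to v16i1 below). | |||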
| 33826 | ||||
| 33827 | // Mask element has to be i1. | |||
| 33828 | assert(Mask.getSimpleValueType().getScalarType() == MVT::i1 && | |||
| 33829 | "Unexpected mask type"); | |||
| 33830 | ||||
| 33831 | MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec); | |||
| 33832 | ||||
| 33833 | DataToStore = ExtendToType(DataToStore, WideDataVT, DAG); | |||
| 33834 | Mask = ExtendToType(Mask, WideMaskVT, DAG, true); | |||
| 33835 | return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(), | |||
| 33836 | N->getOffset(), Mask, N->getMemoryVT(), | |||
| 33837 | N->getMemOperand(), N->getAddressingMode(), | |||
| 33838 | N->isTruncatingStore(), N->isCompressingStore()); | |||
| 33839 | } | |||
| 33840 | ||||
| 33841 | static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, | |||
| 33842 | SelectionDAG &DAG) { | |||
| 33843 | assert(Subtarget.hasAVX2() && | |||
| 33844 | "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only"); | |||
| 33845 | ||||
| 33846 | MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode()); | |||
| 33847 | SDLoc dl(Op); | |||
| 33848 | MVT VT = Op.getSimpleValueType(); | |||
| 33849 | SDValue Index = N->getIndex(); | |||
| 33850 | SDValue Mask = N->getMask(); | |||
| 33851 | SDValue PassThru = N->getPassThru(); | |||
| 33852 | MVT IndexVT = Index.getSimpleValueType(); | |||
| 33853 | ||||
| 33854 | assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); | |||
| 33855 | ||||
| 33856 | // If the index is v2i32, we're being called by type legalization. | |||
| 33857 | if (IndexVT == MVT::v2i32) | |||
| 33858 | return SDValue(); | |||
| 33859 | ||||
| 33860 | // If we don't have VLX and neither the passthru nor the index is | |||
| 33861 | // 512 bits, we need to widen until one is. | |||
| 33862 | MVT OrigVT = VT; | |||
| 33863 | if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && | |||
| 33864 | !IndexVT.is512BitVector()) { | |||
| 33865 | // Determine how much we need to widen by to get a 512-bit type. | |||
| 33866 | unsigned Factor = std::min(512/VT.getSizeInBits(), | |||
| 33867 | 512/IndexVT.getSizeInBits()); | |||
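| | // e.g. v4i32 data with a v4i64 index: Factor = min(512/128, 512/256) = 2, | |||
| | // so both are doubled to v8i32/v8i64 and the index reaches 512 bits. | |||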
| 33868 | ||||
| 33869 | unsigned NumElts = VT.getVectorNumElements() * Factor; | |||
| 33870 | ||||
| 33871 | VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); | |||
| 33872 | IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts); | |||
| 33873 | MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); | |||
| 33874 | ||||
| 33875 | PassThru = ExtendToType(PassThru, VT, DAG); | |||
| 33876 | Index = ExtendToType(Index, IndexVT, DAG); | |||
| 33877 | Mask = ExtendToType(Mask, MaskVT, DAG, true); | |||
| 33878 | } | |||
| 33879 | ||||
| 33880 | // Break dependency on the data register. | |||
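| | // A gather merges unmasked lanes into the destination, so an undef | |||
| | // passthru would leave the result dependent on whatever the register | |||
| | // previously held; a zero vector breaks that false dependency. | |||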
| 33881 | if (PassThru.isUndef()) | |||
| 33882 | PassThru = getZeroVector(VT, Subtarget, DAG, dl); | |||
| 33883 | ||||
| 33884 | SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index, | |||
| 33885 | N->getScale() }; | |||
| 33886 | SDValue NewGather = DAG.getMemIntrinsicNode( | |||
| 33887 | X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(), | |||
| 33888 | N->getMemOperand()); | |||
| 33889 | SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT, | |||
| 33890 | NewGather, DAG.getIntPtrConstant(0, dl)); | |||
| 33891 | return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl); | |||
| 33892 | } | |||
| 33893 | ||||
| 33894 | static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) { | |||
| 33895 | SDLoc dl(Op); | |||
| 33896 | SDValue Src = Op.getOperand(0); | |||
| 33897 | MVT DstVT = Op.getSimpleValueType(); | |||
| 33898 | ||||
| 33899 | AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode()); | |||
| 33900 | unsigned SrcAS = N->getSrcAddressSpace(); | |||
| 33901 | ||||
| 33902 | assert(SrcAS != N->getDestAddressSpace() && | |||
| 33903 | "addrspacecast must be between different address spaces"); | |||
| 33904 | ||||
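| | // Only a cast from the unsigned 32-bit pointer space (ptr32_uptr) | |||
| | // zero-extends; any other 32-to-64-bit cast sign-extends, and a cast | |||
| | // down to 32 bits simply truncates. | |||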
| 33905 | if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) { | |||
| 33906 | Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src); | |||
| 33907 | } else if (DstVT == MVT::i64) { | |||
| 33908 | Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src); | |||
| 33909 | } else if (DstVT == MVT::i32) { | |||
| 33910 | Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src); | |||
| 33911 | } else { | |||
| 33912 | report_fatal_error("Bad address space in addrspacecast"); | |||
| 33913 | } | |||
| 33914 | return Op; | |||
| 33915 | } | |||
| 33916 | ||||
| 33917 | SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op, | |||
| 33918 | SelectionDAG &DAG) const { | |||
| 33919 | // TODO: Eventually, the lowering of these nodes should be informed by or | |||
| 33920 | // deferred to the GC strategy for the function in which they appear. For | |||
| 33921 | // now, however, they must be lowered to something. Since they are logically | |||
| 33922 | // no-ops in the case of a null GC strategy (or a GC strategy which does not | |||
| 33923 | // require special handling for these nodes), lower them as literal NOOPs for | |||
| 33924 | // the time being. | |||
| 33925 | SmallVector<SDValue, 2> Ops; | |||
| 33926 | Ops.push_back(Op.getOperand(0)); | |||
| 33927 | if (Op->getGluedNode()) | |||
| 33928 | Ops.push_back(Op->getOperand(Op->getNumOperands() - 1)); | |||
| 33929 | ||||
| 33930 | SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue); | |||
| 33931 | return SDValue(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0); | |||
| 33932 | } | |||
| 33933 | ||||
| 33934 | // Custom split CVTPS2PH with wide types. | |||
| 33935 | static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) { | |||
| 33936 | SDLoc dl(Op); | |||
| 33937 | EVT VT = Op.getValueType(); | |||
| 33938 | SDValue Lo, Hi; | |||
| 33939 | std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0); | |||
| 33940 | EVT LoVT, HiVT; | |||
| 33941 | std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); | |||
| 33942 | SDValue RC = Op.getOperand(1); | |||
| 33943 | Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC); | |||
| 33944 | Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC); | |||
| 33945 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); | |||
| 33946 | } | |||
| 33947 | ||||
| 33948 | static StringRef getInstrStrFromOpNo(const SmallVectorImpl<StringRef> &AsmStrs, | |||
| 33949 | unsigned OpNo) { | |||
| 33950 | const APInt Operand(32, OpNo); | |||
| 33951 | std::string OpNoStr = llvm::toString(Operand, 10, false); | |||
| 33952 | std::string Str(" $"); | |||
| 33953 | ||||
| 33954 | std::string OpNoStr1(Str + OpNoStr); // e.g. " $1" (OpNo=1) | |||
| 33955 | std::string OpNoStr2(Str + "{" + OpNoStr + ":"); // With modifier, e.g. ${1:P} | |||
| 33956 | ||||
| 33957 | auto I = StringRef::npos; | |||
| 33958 | for (auto &AsmStr : AsmStrs) { | |||
| 33959 | // Match the OpNo string. We should match exactly so we don't match a | |||
| 33960 | // sub-string, e.g. "$12" contains "$1". | |||
| 33961 | if (AsmStr.endswith(OpNoStr1)) | |||
| 33962 | I = AsmStr.size() - OpNoStr1.size(); | |||
| 33963 | ||||
| 33964 | // Get the index of operand in AsmStr. | |||
| 33965 | if (I == StringRef::npos) | |||
| 33966 | I = AsmStr.find(OpNoStr1 + ","); | |||
| 33967 | if (I == StringRef::npos) | |||
| 33968 | I = AsmStr.find(OpNoStr2); | |||
| 33969 | ||||
| 33970 | if (I == StringRef::npos) | |||
| 33971 | continue; | |||
| 33972 | ||||
| 33973 | assert(I > 0 && "Unexpected inline asm string!"); | |||
| 33974 | // Remove the operand string and label (if it exists). | |||
| 33975 | // For example: | |||
| 33976 | // ".L__MSASMLABEL_.${:uid}__l:call dword ptr ${0:P}" | |||
| 33977 | // ==> | |||
| 33978 | // ".L__MSASMLABEL_.${:uid}__l:call dword ptr " | |||
| 33979 | // ==> | |||
| 33980 | // "call dword ptr " | |||
| 33981 | auto TmpStr = AsmStr.substr(0, I); | |||
| 33982 | I = TmpStr.rfind(':'); | |||
| 33983 | if (I != StringRef::npos) | |||
| 33984 | TmpStr = TmpStr.substr(I + 1); | |||
| 33985 | return TmpStr.take_while(llvm::isAlpha); | |||
| 33986 | } | |||
| 33987 | ||||
| 33988 | return StringRef(); | |||
| 33989 | } | |||
| 33990 | ||||
| 33991 | bool X86TargetLowering::isInlineAsmTargetBranch( | |||
| 33992 | const SmallVectorImpl<StringRef> &AsmStrs, unsigned OpNo) const { | |||
| 33993 | // In a __asm block, __asm inst foo where inst is CALL or JMP should be | |||
| 33994 | // changed from indirect TargetLowering::C_Memory to direct | |||
| 33995 | // TargetLowering::C_Address. | |||
| 33996 | // We don't need to special case LOOP* and Jcc, which cannot target a memory | |||
| 33997 | // location. | |||
| 33998 | StringRef Inst = getInstrStrFromOpNo(AsmStrs, OpNo); | |||
| 33999 | return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp"); | |||
| 34000 | } | |||
| 34001 | ||||
| 34002 | /// Provide custom lowering hooks for some operations. | |||
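| | /// A node only reaches this hook if its opcode/type pair was marked | |||
| | /// Custom via setOperationAction() for this subtarget. | |||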
| 34003 | SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { | |||
| 34004 | switch (Op.getOpcode()) { | |||
| 34005 | default: llvm_unreachable("Should not custom lower this!"); | |||
| 34006 | case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG); | |||
| 34007 | case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: | |||
| 34008 | return LowerCMP_SWAP(Op, Subtarget, DAG); | |||
| 34009 | case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG); | |||
| 34010 | case ISD::ATOMIC_LOAD_ADD: | |||
| 34011 | case ISD::ATOMIC_LOAD_SUB: | |||
| 34012 | case ISD::ATOMIC_LOAD_OR: | |||
| 34013 | case ISD::ATOMIC_LOAD_XOR: | |||
| 34014 | case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget); | |||
| 34015 | case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG, Subtarget); | |||
| 34016 | case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG); | |||
| 34017 | case ISD::PARITY: return LowerPARITY(Op, Subtarget, DAG); | |||
| 34018 | case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG); | |||
| 34019 | case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG); | |||
| 34020 | case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG); | |||
| 34021 | case ISD::VSELECT: return LowerVSELECT(Op, DAG); | |||
| 34022 | case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); | |||
| 34023 | case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); | |||
| 34024 | case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); | |||
| 34025 | case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); | |||
| 34026 | case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG); | |||
| 34027 | case ISD::ConstantPool: return LowerConstantPool(Op, DAG); | |||
| 34028 | case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG); | |||
| 34029 | case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); | |||
| 34030 | case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG); | |||
| 34031 | case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); | |||
| 34032 | case ISD::SHL_PARTS: | |||
| 34033 | case ISD::SRA_PARTS: | |||
| 34034 | case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG); | |||
| 34035 | case ISD::FSHL: | |||
| 34036 | case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG); | |||
| 34037 | case ISD::STRICT_SINT_TO_FP: | |||
| 34038 | case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); | |||
| 34039 | case ISD::STRICT_UINT_TO_FP: | |||
| 34040 | case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); | |||
| 34041 | case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG); | |||
| 34042 | case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG); | |||
| 34043 | case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG); | |||
| 34044 | case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG); | |||
| 34045 | case ISD::ZERO_EXTEND_VECTOR_INREG: | |||
| 34046 | case ISD::SIGN_EXTEND_VECTOR_INREG: | |||
| 34047 | return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG); | |||
| 34048 | case ISD::FP_TO_SINT: | |||
| 34049 | case ISD::STRICT_FP_TO_SINT: | |||
| 34050 | case ISD::FP_TO_UINT: | |||
| 34051 | case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG); | |||
| 34052 | case ISD::FP_TO_SINT_SAT: | |||
| 34053 | case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG); | |||
| 34054 | case ISD::FP_EXTEND: | |||
| 34055 | case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG); | |||
| 34056 | case ISD::FP_ROUND: | |||
| 34057 | case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); | |||
| 34058 | case ISD::FP16_TO_FP: | |||
| 34059 | case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG); | |||
| 34060 | case ISD::FP_TO_FP16: | |||
| 34061 | case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG); | |||
| 34062 | case ISD::FP_TO_BF16: return LowerFP_TO_BF16(Op, DAG); | |||
| 34063 | case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG); | |||
| 34064 | case ISD::STORE: return LowerStore(Op, Subtarget, DAG); | |||
| 34065 | case ISD::FADD: | |||
| 34066 | case ISD::FSUB: return lowerFaddFsub(Op, DAG); | |||
| 34067 | case ISD::FROUND: return LowerFROUND(Op, DAG); | |||
| 34068 | case ISD::FABS: | |||
| 34069 | case ISD::FNEG: return LowerFABSorFNEG(Op, DAG); | |||
| 34070 | case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); | |||
| 34071 | case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG); | |||
| 34072 | case ISD::LRINT: | |||
| 34073 | case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG); | |||
| 34074 | case ISD::SETCC: | |||
| 34075 | case ISD::STRICT_FSETCC: | |||
| 34076 | case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG); | |||
| 34077 | case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG); | |||
| 34078 | case ISD::SELECT: return LowerSELECT(Op, DAG); | |||
| 34079 | case ISD::BRCOND: return LowerBRCOND(Op, DAG); | |||
| 34080 | case ISD::JumpTable: return LowerJumpTable(Op, DAG); | |||
| 34081 | case ISD::VASTART: return LowerVASTART(Op, DAG); | |||
| 34082 | case ISD::VAARG: return LowerVAARG(Op, DAG); | |||
| 34083 | case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); | |||
| 34084 | case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); | |||
| 34085 | case ISD::INTRINSIC_VOID: | |||
| 34086 | case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); | |||
| 34087 | case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); | |||
| 34088 | case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG); | |||
| 34089 | case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); | |||
| 34090 | case ISD::FRAME_TO_ARGS_OFFSET: | |||
| 34091 | return LowerFRAME_TO_ARGS_OFFSET(Op, DAG); | |||
| 34092 | case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); | |||
| 34093 | case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); | |||
| 34094 | case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG); | |||
| 34095 | case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG); | |||
| 34096 | case ISD::EH_SJLJ_SETUP_DISPATCH: | |||
| 34097 | return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG); | |||
| 34098 | case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG); | |||
| 34099 | case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG); | |||
| 34100 | case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG); | |||
| 34101 | case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG); | |||
| 34102 | case ISD::CTLZ: | |||
| 34103 | case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG); | |||
| 34104 | case ISD::CTTZ: | |||
| 34105 | case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, Subtarget, DAG); | |||
| 34106 | case ISD::MUL: return LowerMUL(Op, Subtarget, DAG); | |||
| 34107 | case ISD::MULHS: | |||
| 34108 | case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG); | |||
| 34109 | case ISD::ROTL: | |||
| 34110 | case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG); | |||
| 34111 | case ISD::SRA: | |||
| 34112 | case ISD::SRL: | |||
| 34113 | case ISD::SHL: return LowerShift(Op, Subtarget, DAG); | |||
| 34114 | case ISD::SADDO: | |||
| 34115 | case ISD::UADDO: | |||
| 34116 | case ISD::SSUBO: | |||
| 34117 | case ISD::USUBO: return LowerXALUO(Op, DAG); | |||
| 34118 | case ISD::SMULO: | |||
| 34119 | case ISD::UMULO: return LowerMULO(Op, Subtarget, DAG); | |||
| 34120 | case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG); | |||
| 34121 | case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG); | |||
| 34122 | case ISD::SADDO_CARRY: | |||
| 34123 | case ISD::SSUBO_CARRY: | |||
| 34124 | case ISD::UADDO_CARRY: | |||
| 34125 | case ISD::USUBO_CARRY: return LowerADDSUBO_CARRY(Op, DAG); | |||
| 34126 | case ISD::ADD: | |||
| 34127 | case ISD::SUB: return lowerAddSub(Op, DAG, Subtarget); | |||
| 34128 | case ISD::UADDSAT: | |||
| 34129 | case ISD::SADDSAT: | |||
| 34130 | case ISD::USUBSAT: | |||
| 34131 | case ISD::SSUBSAT: return LowerADDSAT_SUBSAT(Op, DAG, Subtarget); | |||
| 34132 | case ISD::SMAX: | |||
| 34133 | case ISD::SMIN: | |||
| 34134 | case ISD::UMAX: | |||
| 34135 | case ISD::UMIN: return LowerMINMAX(Op, Subtarget, DAG); | |||
| 34136 | case ISD::FMINIMUM: | |||
| 34137 | case ISD::FMAXIMUM: | |||
| 34138 | return LowerFMINIMUM_FMAXIMUM(Op, Subtarget, DAG); | |||
| 34139 | case ISD::ABS: return LowerABS(Op, Subtarget, DAG); | |||
| 34140 | case ISD::ABDS: | |||
| 34141 | case ISD::ABDU: return LowerABD(Op, Subtarget, DAG); | |||
| 34142 | case ISD::AVGCEILU: return LowerAVG(Op, Subtarget, DAG); | |||
| 34143 | case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG); | |||
| 34144 | case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG); | |||
| 34145 | case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG); | |||
| 34146 | case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG); | |||
| 34147 | case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG); | |||
| 34148 | case ISD::GC_TRANSITION_START: | |||
| 34149 | case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG); | |||
| 34150 | case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG); | |||
| 34151 | case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG); | |||
| 34152 | } | |||
| 34153 | } | |||
| 34154 | ||||
| 34155 | /// Replace a node with an illegal result type with a new node built out of | |||
| 34156 | /// custom code. | |||
| 34157 | void X86TargetLowering::ReplaceNodeResults(SDNode *N, | |||
| 34158 | SmallVectorImpl<SDValue>&Results, | |||
| 34159 | SelectionDAG &DAG) const { | |||
| 34160 | SDLoc dl(N); | |||
| 34161 | switch (N->getOpcode()) { | |||
| 34162 | default: | |||
| 34163 | #ifndef NDEBUG | |||
| 34164 | dbgs() << "ReplaceNodeResults: "; | |||
| 34165 | N->dump(&DAG); | |||
| 34166 | #endif | |||
| 34167 | llvm_unreachable("Do not know how to custom type legalize this operation!"); | |||
| 34168 | case X86ISD::CVTPH2PS: { | |||
| 34169 | EVT VT = N->getValueType(0); | |||
| 34170 | SDValue Lo, Hi; | |||
| 34171 | std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); | |||
| 34172 | EVT LoVT, HiVT; | |||
| 34173 | std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); | |||
| 34174 | Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo); | |||
| 34175 | Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi); | |||
| 34176 | SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); | |||
| 34177 | Results.push_back(Res); | |||
| 34178 | return; | |||
| 34179 | } | |||
| 34180 | case X86ISD::STRICT_CVTPH2PS: { | |||
| 34181 | EVT VT = N->getValueType(0); | |||
| 34182 | SDValue Lo, Hi; | |||
| 34183 | std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1); | |||
| 34184 | EVT LoVT, HiVT; | |||
| 34185 | std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT); | |||
| 34186 | Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other}, | |||
| 34187 | {N->getOperand(0), Lo}); | |||
| 34188 | Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other}, | |||
| 34189 | {N->getOperand(0), Hi}); | |||
| 34190 | SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, | |||
| 34191 | Lo.getValue(1), Hi.getValue(1)); | |||
| 34192 | SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); | |||
| 34193 | Results.push_back(Res); | |||
| 34194 | Results.push_back(Chain); | |||
| 34195 | return; | |||
| 34196 | } | |||
| 34197 | case X86ISD::CVTPS2PH: | |||
| 34198 | Results.push_back(LowerCVTPS2PH(SDValue(N, 0), DAG)); | |||
| 34199 | return; | |||
| 34200 | case ISD::CTPOP: { | |||
| 34201 | assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); | |||
| 34202 | // Use a v2i64 if possible. | |||
| 34203 | bool NoImplicitFloatOps = | |||
| 34204 | DAG.getMachineFunction().getFunction().hasFnAttribute( | |||
| 34205 | Attribute::NoImplicitFloat); | |||
| 34206 | if (isTypeLegal(MVT::v2i64) && !NoImplicitFloatOps) { | |||
| 34207 | SDValue Wide = | |||
| 34208 | DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, N->getOperand(0)); | |||
| 34209 | Wide = DAG.getNode(ISD::CTPOP, dl, MVT::v2i64, Wide); | |||
| 34210 | // The bit count fits in 32 bits, so extract it as i32 and then zero | |||
| 34211 | // extend to i64. Otherwise we end up extracting bits 63:32 separately. | |||
| 34212 | Wide = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Wide); | |||
| 34213 | Wide = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Wide, | |||
| 34214 | DAG.getIntPtrConstant(0, dl)); | |||
| 34215 | Wide = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Wide); | |||
| 34216 | Results.push_back(Wide); | |||
| 34217 | } | |||
| 34218 | return; | |||
| 34219 | } | |||
| 34220 | case ISD::MUL: { | |||
| 34221 | EVT VT = N->getValueType(0); | |||
| 34222 | assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && | |||
| 34223 | VT.getVectorElementType() == MVT::i8 && "Unexpected VT!"); | |||
| 34224 | // Pre-promote these to vXi16 to avoid op legalization thinking all 16 | |||
| 34225 | // elements are needed. | |||
| 34226 | MVT MulVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements()); | |||
| 34227 | SDValue Op0 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(0)); | |||
| 34228 | SDValue Op1 = DAG.getNode(ISD::ANY_EXTEND, dl, MulVT, N->getOperand(1)); | |||
| 34229 | SDValue Res = DAG.getNode(ISD::MUL, dl, MulVT, Op0, Op1); | |||
| 34230 | Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); | |||
| 34231 | unsigned NumConcats = 16 / VT.getVectorNumElements(); | |||
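| | // e.g. for a v4i8 mul: MulVT = v4i16 above and NumConcats = 16 / 4 = 4, | |||
| | // so the v4i8 result lands in lane 0 of a legal v16i8 vector below. | |||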
| 34232 | SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT)); | |||
| 34233 | ConcatOps[0] = Res; | |||
| 34234 | Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i8, ConcatOps); | |||
| 34235 | Results.push_back(Res); | |||
| 34236 | return; | |||
| 34237 | } | |||
| 34238 | case ISD::SMULO: | |||
| 34239 | case ISD::UMULO: { | |||
| 34240 | EVT VT = N->getValueType(0); | |||
| 34241 | assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && | |||
| 34242 | VT == MVT::v2i32 && "Unexpected VT!"); | |||
| 34243 | bool IsSigned = N->getOpcode() == ISD::SMULO; | |||
| 34244 | unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; | |||
| 34245 | SDValue Op0 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(0)); | |||
| 34246 | SDValue Op1 = DAG.getNode(ExtOpc, dl, MVT::v2i64, N->getOperand(1)); | |||
| 34247 | SDValue Res = DAG.getNode(ISD::MUL, dl, MVT::v2i64, Op0, Op1); | |||
| 34248 | // Extract the high 32 bits from each result using PSHUFD. | |||
| 34249 | // TODO: Could use SRL+TRUNCATE but that doesn't become a PSHUFD. | |||
| 34250 | SDValue Hi = DAG.getBitcast(MVT::v4i32, Res); | |||
| 34251 | Hi = DAG.getVectorShuffle(MVT::v4i32, dl, Hi, Hi, {1, 3, -1, -1}); | |||
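| | // Lanes {1, 3} are the odd 32-bit halves, i.e. the high 32 bits of the | |||
| | // two 64-bit products. | |||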
| 34252 | Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Hi, | |||
| 34253 | DAG.getIntPtrConstant(0, dl)); | |||
| 34254 | ||||
| 34255 | // Truncate the low bits of the result. This will become PSHUFD. | |||
| 34256 | Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); | |||
| 34257 | ||||
| 34258 | SDValue HiCmp; | |||
| 34259 | if (IsSigned) { | |||
| 34260 | // SMULO overflows if the high bits don't match the sign of the low. | |||
| 34261 | HiCmp = DAG.getNode(ISD::SRA, dl, VT, Res, DAG.getConstant(31, dl, VT)); | |||
| 34262 | } else { | |||
| 34263 | // UMULO overflows if the high bits are non-zero. | |||
| 34264 | HiCmp = DAG.getConstant(0, dl, VT); | |||
| 34265 | } | |||
| 34266 | SDValue Ovf = DAG.getSetCC(dl, N->getValueType(1), Hi, HiCmp, ISD::SETNE); | |||
| 34267 | ||||
| 34268 | // Widen the result by padding with undef. | |||
| 34269 | Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res, | |||
| 34270 | DAG.getUNDEF(VT)); | |||
| 34271 | Results.push_back(Res); | |||
| 34272 | Results.push_back(Ovf); | |||
| 34273 | return; | |||
| 34274 | } | |||
| 34275 | case X86ISD::VPMADDWD: { | |||
| 34276 | // Legalize types for X86ISD::VPMADDWD by widening. | |||
| 34277 | assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); | |||
| 34278 | ||||
| 34279 | EVT VT = N->getValueType(0); | |||
| 34280 | EVT InVT = N->getOperand(0).getValueType(); | |||
| 34281 | assert(VT.getSizeInBits() < 128 && 128 % VT.getSizeInBits() == 0 && | |||
| 34282 | "Expected a VT that divides into 128 bits."); | |||
| 34283 | assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && | |||
| 34284 | "Unexpected type action!"); | |||
| 34285 | unsigned NumConcat = 128 / InVT.getSizeInBits(); | |||
| 34286 | ||||
| 34287 | EVT InWideVT = EVT::getVectorVT(*DAG.getContext(), | |||
| 34288 | InVT.getVectorElementType(), | |||
| 34289 | NumConcat * InVT.getVectorNumElements()); | |||
| 34290 | EVT WideVT = EVT::getVectorVT(*DAG.getContext(), | |||
| 34291 | VT.getVectorElementType(), | |||
| 34292 | NumConcat * VT.getVectorNumElements()); | |||
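| | // e.g. v4i16 operands (64 bits): NumConcat = 128 / 64 = 2, so the inputs | |||
| | // widen to v8i16 and the v2i32 result is computed as part of a v4i32. | |||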
| 34293 | ||||
| 34294 | SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT)); | |||
| 34295 | Ops[0] = N->getOperand(0); | |||
| 34296 | SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops); | |||
| 34297 | Ops[0] = N->getOperand(1); | |||
| 34298 | SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWideVT, Ops); | |||
| 34299 | ||||
| 34300 | SDValue Res = DAG.getNode(N->getOpcode(), dl, WideVT, InVec0, InVec1); | |||
| 34301 | Results.push_back(Res); | |||
| 34302 | return; | |||
| 34303 | } | |||
| 34304 | // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. | |||
| 34305 | case X86ISD::FMINC: | |||
| 34306 | case X86ISD::FMIN: | |||
| 34307 | case X86ISD::FMAXC: | |||
| 34308 | case X86ISD::FMAX: { | |||
| 34309 | EVT VT = N->getValueType(0); | |||
| 34310 | assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX."); | |||
| 34311 | SDValue UNDEF = DAG.getUNDEF(VT); | |||
| 34312 | SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, | |||
| 34313 | N->getOperand(0), UNDEF); | |||
| 34314 | SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, | |||
| 34315 | N->getOperand(1), UNDEF); | |||
| 34316 | Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS)); | |||
| 34317 | return; | |||
| 34318 | } | |||
| 34319 | case ISD::SDIV: | |||
| 34320 | case ISD::UDIV: | |||
| 34321 | case ISD::SREM: | |||
| 34322 | case ISD::UREM: { | |||
| 34323 | EVT VT = N->getValueType(0); | |||
| 34324 | if (VT.isVector()) { | |||
| 34325 | assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && | |||
| 34326 | "Unexpected type action!"); | |||
| 34327 | // If this RHS is a constant splat vector we can widen this and let | |||
| 34328 | // division/remainder by constant optimize it. | |||
| 34329 | // TODO: Can we do something for non-splat? | |||
| 34330 | APInt SplatVal; | |||
| 34331 | if (ISD::isConstantSplatVector(N->getOperand(1).getNode(), SplatVal)) { | |||
| 34332 | unsigned NumConcats = 128 / VT.getSizeInBits(); | |||
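| | // e.g. v2i32: NumConcats = 128 / 64 = 2, so the dividend widens to the | |||
| | // legal v4i32 with undef in the upper lanes. | |||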
| 34333 | SmallVector<SDValue, 8> Ops0(NumConcats, DAG.getUNDEF(VT)); | |||
| 34334 | Ops0[0] = N->getOperand(0); | |||
| 34335 | EVT ResVT = getTypeToTransformTo(*DAG.getContext(), VT); | |||
| 34336 | SDValue N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Ops0); | |||
| 34337 | SDValue N1 = DAG.getConstant(SplatVal, dl, ResVT); | |||
| 34338 | SDValue Res = DAG.getNode(N->getOpcode(), dl, ResVT, N0, N1); | |||
| 34339 | Results.push_back(Res); | |||
| 34340 | } | |||
| 34341 | return; | |||
| 34342 | } | |||
| 34343 | ||||
| 34344 | SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG); | |||
| 34345 | Results.push_back(V); | |||
| 34346 | return; | |||
| 34347 | } | |||
| 34348 | case ISD::TRUNCATE: { | |||
| 34349 | MVT VT = N->getSimpleValueType(0); | |||
| 34350 | if (getTypeAction(*DAG.getContext(), VT) != TypeWidenVector) | |||
| 34351 | return; | |||
| 34352 | ||||
| 34353 | // The generic legalizer will try to widen the input type to the same | |||
| 34354 | // number of elements as the widened result type. But this isn't always | |||
| 34355 | // the best thing so do some custom legalization to avoid some cases. | |||
| 34356 | MVT WidenVT = getTypeToTransformTo(*DAG.getContext(), VT).getSimpleVT(); | |||
| 34357 | SDValue In = N->getOperand(0); | |||
| 34358 | EVT InVT = In.getValueType(); | |||
| 34359 | ||||
| 34360 | unsigned InBits = InVT.getSizeInBits(); | |||
| 34361 | if (128 % InBits == 0) { | |||
| 34362 | // 128 bit and smaller inputs should avoid truncate altogether and | |||
| 34363 | // just use a build_vector that will become a shuffle. | |||
| 34364 | // TODO: Widen and use a shuffle directly? | |||
| 34365 | MVT InEltVT = InVT.getSimpleVT().getVectorElementType(); | |||
| 34366 | EVT EltVT = VT.getVectorElementType(); | |||
| 34367 | unsigned WidenNumElts = WidenVT.getVectorNumElements(); | |||
| 34368 | SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT)); | |||
| 34369 | // Use the original element count so we don't do more scalar opts than | |||
| 34370 | // necessary. | |||
| 34371 | unsigned MinElts = VT.getVectorNumElements(); | |||
| 34372 | for (unsigned i=0; i < MinElts; ++i) { | |||
| 34373 | SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, In, | |||
| 34374 | DAG.getIntPtrConstant(i, dl)); | |||
| 34375 | Ops[i] = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Val); | |||
| 34376 | } | |||
| 34377 | Results.push_back(DAG.getBuildVector(WidenVT, dl, Ops)); | |||
| 34378 | return; | |||
| 34379 | } | |||
| 34380 | // With AVX512 there are some cases that can use a target specific | |||
| 34381 | // truncate node to go from 256/512 to less than 128 with zeros in the | |||
| 34382 | // upper elements of the 128 bit result. | |||
| 34383 | if (Subtarget.hasAVX512() && isTypeLegal(InVT)) { | |||
| 34384 | // We can use VTRUNC directly if for 256 bits with VLX or for any 512. | |||
| 34385 | if ((InBits == 256 && Subtarget.hasVLX()) || InBits == 512) { | |||
| 34386 | Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In)); | |||
| 34387 | return; | |||
| 34388 | } | |||
| 34389 | // There's one case we can widen to 512 bits and use VTRUNC. | |||
| 34390 | if (InVT == MVT::v4i64 && VT == MVT::v4i8 && isTypeLegal(MVT::v8i64)) { | |||
| 34391 | In = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i64, In, | |||
| 34392 | DAG.getUNDEF(MVT::v4i64)); | |||
| 34393 | Results.push_back(DAG.getNode(X86ISD::VTRUNC, dl, WidenVT, In)); | |||
| 34394 | return; | |||
| 34395 | } | |||
| 34396 | } | |||
| 34397 | if (Subtarget.hasVLX() && InVT == MVT::v8i64 && VT == MVT::v8i8 && | |||
| 34398 | getTypeAction(*DAG.getContext(), InVT) == TypeSplitVector && | |||
| 34399 | isTypeLegal(MVT::v4i64)) { | |||
| 34400 | // Input needs to be split and output needs to widened. Let's use two | |||
| 34401 | // VTRUNCs, and shuffle their results together into the wider type. | |||
| 34402 | SDValue Lo, Hi; | |||
| 34403 | std::tie(Lo, Hi) = DAG.SplitVector(In, dl); | |||
| 34404 | ||||
| 34405 | Lo = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Lo); | |||
| 34406 | Hi = DAG.getNode(X86ISD::VTRUNC, dl, MVT::v16i8, Hi); | |||
| 34407 | SDValue Res = DAG.getVectorShuffle(MVT::v16i8, dl, Lo, Hi, | |||
| 34408 | { 0, 1, 2, 3, 16, 17, 18, 19, | |||
| 34409 | -1, -1, -1, -1, -1, -1, -1, -1 }); | |||
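| | // The mask takes the low 4 bytes of each VTRUNC result (lanes 0-3 and | |||
| | // 16-19), placing the v8i8 payload in the bottom half of the v16i8. | |||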
| 34410 | Results.push_back(Res); | |||
| 34411 | return; | |||
| 34412 | } | |||
| 34413 | ||||
| 34414 | return; | |||
| 34415 | } | |||
| 34416 | case ISD::ANY_EXTEND: | |||
| 34417 | // Right now, only MVT::v8i8 has Custom action for an illegal type. | |||
| 34418 | // It's intended to custom handle the input type. | |||
| 34419 | assert(N->getValueType(0) == MVT::v8i8 && | |||
| 34420 | "Do not know how to legalize this Node"); | |||
| 34421 | return; | |||
| 34422 | case ISD::SIGN_EXTEND: | |||
| 34423 | case ISD::ZERO_EXTEND: { | |||
| 34424 | EVT VT = N->getValueType(0); | |||
| 34425 | SDValue In = N->getOperand(0); | |||
| 34426 | EVT InVT = In.getValueType(); | |||
| 34427 | if (!Subtarget.hasSSE41() && VT == MVT::v4i64 && | |||
| 34428 | (InVT == MVT::v4i16 || InVT == MVT::v4i8)){ | |||
| 34429 | assert(getTypeAction(*DAG.getContext(), InVT) == TypeWidenVector && | |||
| 34430 | "Unexpected type action!"); | |||
| 34431 | assert(N->getOpcode() == ISD::SIGN_EXTEND && "Unexpected opcode"); | |||
| 34432 | // Custom split this so we can extend i8/i16->i32 invec. This is better | |||
| 34433 | // since sign_extend_inreg i8/i16->i64 requires an extend to i32 using | |||
| 34434 | // sra. Then extending from i32 to i64 using pcmpgt. By custom splitting | |||
| 34435 | // we allow the sra from the extend to i32 to be shared by the split. | |||
| 34436 | In = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, In); | |||
| 34437 | ||||
| 34438 | // Fill a vector with sign bits for each element. | |||
| 34439 | SDValue Zero = DAG.getConstant(0, dl, MVT::v4i32); | |||
| 34440 | SDValue SignBits = DAG.getSetCC(dl, MVT::v4i32, Zero, In, ISD::SETGT); | |||
| 34441 | ||||
| 34442 | // Create an unpackl and unpackh to interleave the sign bits then bitcast | |||
| 34443 | // to v2i64. | |||
| 34444 | SDValue Lo = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, | |||
| 34445 | {0, 4, 1, 5}); | |||
| 34446 | Lo = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Lo); | |||
| 34447 | SDValue Hi = DAG.getVectorShuffle(MVT::v4i32, dl, In, SignBits, | |||
| 34448 | {2, 6, 3, 7}); | |||
| 34449 | Hi = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, Hi); | |||
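| | // e.g. In = <a, b, c, d> with sign vector <sa, sb, sc, sd>: Lo becomes | |||
| | // <a, sa, b, sb>, which read as v2i64 is the sign extension of a and b. | |||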
| 34450 | ||||
| 34451 | SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); | |||
| 34452 | Results.push_back(Res); | |||
| 34453 | return; | |||
| 34454 | } | |||
| 34455 | ||||
| 34456 | if (VT == MVT::v16i32 || VT == MVT::v8i64) { | |||
| 34457 | if (!InVT.is128BitVector()) { | |||
| 34458 | // Not a 128 bit vector, but maybe type legalization will promote | |||
| 34459 | // it to 128 bits. | |||
| 34460 | if (getTypeAction(*DAG.getContext(), InVT) != TypePromoteInteger) | |||
| 34461 | return; | |||
| 34462 | InVT = getTypeToTransformTo(*DAG.getContext(), InVT); | |||
| 34463 | if (!InVT.is128BitVector()) | |||
| 34464 | return; | |||
| 34465 | ||||
| 34466 | // Promote the input to 128 bits. Type legalization will turn this into | |||
| 34467 | // zext_inreg/sext_inreg. | |||
| 34468 | In = DAG.getNode(N->getOpcode(), dl, InVT, In); | |||
| 34469 | } | |||
| 34470 | ||||
| 34471 | // Perform custom splitting instead of the two stage extend we would get | |||
| 34472 | // by default. | |||
| 34473 | EVT LoVT, HiVT; | |||
| 34474 | std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0)); | |||
| 34475 | assert(isTypeLegal(LoVT) && "Split VT not legal?"); | |||
| 34476 | ||||
| 34477 | SDValue Lo = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, LoVT, In, DAG); | |||
| 34478 | ||||
| 34479 | // We need to shift the input over by half the number of elements. | |||
| 34480 | unsigned NumElts = InVT.getVectorNumElements(); | |||
| 34481 | unsigned HalfNumElts = NumElts / 2; | |||
| 34482 | SmallVector<int, 16> ShufMask(NumElts, SM_SentinelUndef); | |||
| 34483 | for (unsigned i = 0; i != HalfNumElts; ++i) | |||
| 34484 | ShufMask[i] = i + HalfNumElts; | |||
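| | // e.g. with NumElts = 8: ShufMask = {4, 5, 6, 7, -1, -1, -1, -1}, moving | |||
| | // the upper half of the input into the low elements. | |||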
| 34485 | ||||
| 34486 | SDValue Hi = DAG.getVectorShuffle(InVT, dl, In, In, ShufMask); | |||
| 34487 | Hi = getEXTEND_VECTOR_INREG(N->getOpcode(), dl, HiVT, Hi, DAG); | |||
| 34488 | ||||
| 34489 | SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi); | |||
| 34490 | Results.push_back(Res); | |||
| 34491 | } | |||
| 34492 | return; | |||
| 34493 | } | |||
| 34494 | case ISD::FP_TO_SINT: | |||
| 34495 | case ISD::STRICT_FP_TO_SINT: | |||
| 34496 | case ISD::FP_TO_UINT: | |||
| 34497 | case ISD::STRICT_FP_TO_UINT: { | |||
| 34498 | bool IsStrict = N->isStrictFPOpcode(); | |||
| 34499 | bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT || | |||
| 34500 | N->getOpcode() == ISD::STRICT_FP_TO_SINT; | |||
| 34501 | EVT VT = N->getValueType(0); | |||
| 34502 | SDValue Src = N->getOperand(IsStrict ? 1 : 0); | |||
| 34503 | SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); | |||
| 34504 | EVT SrcVT = Src.getValueType(); | |||
| 34505 | ||||
| 34506 | SDValue Res; | |||
| 34507 | if (isSoftFP16(SrcVT)) { | |||
| 34508 | EVT NVT = VT.isVector() ? VT.changeVectorElementType(MVT::f32) : MVT::f32; | |||
| 34509 | if (IsStrict) { | |||
| 34510 | Res = | |||
| 34511 | DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other}, | |||
| 34512 | {Chain, DAG.getNode(ISD::STRICT_FP_EXTEND, dl, | |||
| 34513 | {NVT, MVT::Other}, {Chain, Src})}); | |||
| 34514 | Chain = Res.getValue(1); | |||
| 34515 | } else { | |||
| 34516 | Res = DAG.getNode(N->getOpcode(), dl, VT, | |||
| 34517 | DAG.getNode(ISD::FP_EXTEND, dl, NVT, Src)); | |||
| 34518 | } | |||
| 34519 | Results.push_back(Res); | |||
| 34520 | if (IsStrict) | |||
| 34521 | Results.push_back(Chain); | |||
| 34522 | ||||
| 34523 | return; | |||
| 34524 | } | |||
| 34525 | ||||
| 34526 | if (VT.isVector() && Subtarget.hasFP16() && | |||
| 34527 | SrcVT.getVectorElementType() == MVT::f16) { | |||
| 34528 | EVT EleVT = VT.getVectorElementType(); | |||
| 34529 | EVT ResVT = EleVT == MVT::i32 ? MVT::v4i32 : MVT::v8i16; | |||
| 34530 | ||||
| 34531 | if (SrcVT != MVT::v8f16) { | |||
| 34532 | SDValue Tmp = | |||
| 34533 | IsStrict ? DAG.getConstantFP(0.0, dl, SrcVT) : DAG.getUNDEF(SrcVT); | |||
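| | // A strict node must not raise spurious FP exceptions on the padding | |||
| | // lanes, so they are filled with 0.0 rather than left undef. | |||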
| 34534 | SmallVector<SDValue, 4> Ops(SrcVT == MVT::v2f16 ? 4 : 2, Tmp); | |||
| 34535 | Ops[0] = Src; | |||
| 34536 | Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f16, Ops); | |||
| 34537 | } | |||
| 34538 | ||||
| 34539 | if (IsStrict) { | |||
| 34540 | unsigned Opc = | |||
| 34541 | IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; | |||
| 34542 | Res = | |||
| 34543 | DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {N->getOperand(0), Src}); | |||
| 34544 | Chain = Res.getValue(1); | |||
| 34545 | } else { | |||
| 34546 | unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; | |||
| 34547 | Res = DAG.getNode(Opc, dl, ResVT, Src); | |||
| 34548 | } | |||
| 34549 | ||||
| 34550 | // TODO: Need to add exception check code for strict FP. | |||
| 34551 | if (EleVT.getSizeInBits() < 16) { | |||
| 34552 | MVT TmpVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8); | |||
| 34553 | Res = DAG.getNode(ISD::TRUNCATE, dl, TmpVT, Res); | |||
| 34554 | ||||
| 34555 | // Now widen to 128 bits. | |||
| 34556 | unsigned NumConcats = 128 / TmpVT.getSizeInBits(); | |||
| 34557 | MVT ConcatVT = MVT::getVectorVT(EleVT.getSimpleVT(), 8 * NumConcats); | |||
| 34558 | SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(TmpVT)); | |||
| 34559 | ConcatOps[0] = Res; | |||
| 34560 | Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps); | |||
| 34561 | } | |||
| 34562 | ||||
| 34563 | Results.push_back(Res); | |||
| 34564 | if (IsStrict) | |||
| 34565 | Results.push_back(Chain); | |||
| 34566 | ||||
| 34567 | return; | |||
| 34568 | } | |||
| 34569 | ||||
| 34570 | if (VT.isVector() && VT.getScalarSizeInBits() < 32) { | |||
| 34571 | assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && | |||
| 34572 | "Unexpected type action!"); | |||
| 34573 | ||||
| 34574 | // Try to create a 128 bit vector, but don't exceed a 32 bit element. | |||
| 34575 | unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U); | |||
| 34576 | MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth), | |||
| 34577 | VT.getVectorNumElements()); | |||
| 34578 | SDValue Res; | |||
| 34579 | SDValue Chain; | |||
| 34580 | if (IsStrict) { | |||
| 34581 | Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other}, | |||
| 34582 | {N->getOperand(0), Src}); | |||
| 34583 | Chain = Res.getValue(1); | |||
| 34584 | } else | |||
| 34585 | Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src); | |||
| 34586 | ||||
| 34587 | // Preserve what we know about the size of the original result. If the | |||
| 34588 | // result is v2i32, we have to manually widen the assert. | |||
| 34589 | if (PromoteVT == MVT::v2i32) | |||
| 34590 | Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Res, | |||
| 34591 | DAG.getUNDEF(MVT::v2i32)); | |||
| 34592 | ||||
| 34593 | Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext, dl, | |||
| 34594 | Res.getValueType(), Res, | |||
| 34595 | DAG.getValueType(VT.getVectorElementType())); | |||
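| | // AssertSext/AssertZext record that the upper bits already match a | |||
| | // sign/zero extension from the narrow element type, so later combines | |||
| | // can fold the TRUNCATE below. | |||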
| 34596 | ||||
| 34597 | if (PromoteVT == MVT::v2i32) | |||
| 34598 | Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, | |||
| 34599 | DAG.getIntPtrConstant(0, dl)); | |||
| 34600 | ||||
| 34601 | // Truncate back to the original width. | |||
| 34602 | Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res); | |||
| 34603 | ||||
| 34604 | // Now widen to 128 bits. | |||
| 34605 | unsigned NumConcats = 128 / VT.getSizeInBits(); | |||
| 34606 | MVT ConcatVT = MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(), | |||
| 34607 | VT.getVectorNumElements() * NumConcats); | |||
| 34608 | SmallVector<SDValue, 8> ConcatOps(NumConcats, DAG.getUNDEF(VT)); | |||
| 34609 | ConcatOps[0] = Res; | |||
| 34610 | Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps); | |||
| 34611 | Results.push_back(Res); | |||
| 34612 | if (IsStrict) | |||
| 34613 | Results.push_back(Chain); | |||
| 34614 | return; | |||
| 34615 | } | |||
| 34616 | ||||
| 34617 | ||||
| 34618 | if (VT == MVT::v2i32) { | |||
| 34619 | assert((!IsStrict || IsSigned || Subtarget.hasAVX512()) && | |||
| 34620 | "Strict unsigned conversion requires AVX512"); | |||
| 34621 | assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); | |||
| 34622 | assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && | |||
| 34623 | "Unexpected type action!"); | |||
| 34624 | if (Src.getValueType() == MVT::v2f64) { | |||
| 34625 | if (!IsSigned && !Subtarget.hasAVX512()) { | |||
| 34626 | SDValue Res = | |||
| 34627 | expandFP_TO_UINT_SSE(MVT::v4i32, Src, dl, DAG, Subtarget); | |||
| 34628 | Results.push_back(Res); | |||
| 34629 | return; | |||
| 34630 | } | |||
| 34631 | ||||
| 34632 | unsigned Opc; | |||
| 34633 | if (IsStrict) | |||
| 34634 | Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; | |||
| 34635 | else | |||
| 34636 | Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; | |||
| 34637 | ||||
| 34638 | // If we have VLX we can emit a target specific FP_TO_UINT node. | |||
| 34639 | if (!IsSigned && !Subtarget.hasVLX()) { | |||
| 34640 | // Otherwise we can defer to the generic legalizer which will widen | |||
| 34641 | // the input as well. This will be further widened during op | |||
| 34642 | // legalization to v8i32<-v8f64. | |||
| 34643 | // For strict nodes we'll need to widen ourselves. | |||
| 34644 | // FIXME: Fix the type legalizer to safely widen strict nodes? | |||
| 34645 | if (!IsStrict) | |||
| 34646 | return; | |||
| 34647 | Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src, | |||
| 34648 | DAG.getConstantFP(0.0, dl, MVT::v2f64)); | |||
| 34649 | Opc = N->getOpcode(); | |||
| 34650 | } | |||
| 34651 | SDValue Res; | |||
| 34652 | SDValue Chain; | |||
| 34653 | if (IsStrict) { | |||
| 34654 | Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other}, | |||
| 34655 | {N->getOperand(0), Src}); | |||
| 34656 | Chain = Res.getValue(1); | |||
| 34657 | } else { | |||
| 34658 | Res = DAG.getNode(Opc, dl, MVT::v4i32, Src); | |||
| 34659 | } | |||
| 34660 | Results.push_back(Res); | |||
| 34661 | if (IsStrict) | |||
| 34662 | Results.push_back(Chain); | |||
| 34663 | return; | |||
| 34664 | } | |||
| 34665 | ||||
| 34666 | // Custom widen strict v2f32->v2i32 by padding with zeros. | |||
| 34667 | // FIXME: Should generic type legalizer do this? | |||
| 34668 | if (Src.getValueType() == MVT::v2f32 && IsStrict) { | |||
| 34669 | Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, | |||
| 34670 | DAG.getConstantFP(0.0, dl, MVT::v2f32)); | |||
| 34671 | SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other}, | |||
| 34672 | {N->getOperand(0), Src}); | |||
| 34673 | Results.push_back(Res); | |||
| 34674 | Results.push_back(Res.getValue(1)); | |||
| 34675 | return; | |||
| 34676 | } | |||
| 34677 | ||||
| 34678 | // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs, | |||
| 34679 | // so early out here. | |||
| 34680 | return; | |||
| 34681 | } | |||
| 34682 | ||||
| 34683 | assert(!VT.isVector() && "Vectors should have been handled above!"); | |||
| 34684 | ||||
| 34685 | if ((Subtarget.hasDQI() && VT == MVT::i64 && | |||
| 34686 | (SrcVT == MVT::f32 || SrcVT == MVT::f64)) || | |||
| 34687 | (Subtarget.hasFP16() && SrcVT == MVT::f16)) { | |||
| 34688 | assert(!Subtarget.is64Bit() && "i64 should be legal"); | |||
| 34689 | unsigned NumElts = Subtarget.hasVLX() ? 2 : 8; | |||
| 34690 | // If we use a 128-bit result we might need to use a target specific node. | |||
| 34691 | unsigned SrcElts = | |||
| 34692 | std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits()); | |||
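| | // Note: this lowers the scalar conversion through a vector: insert the | |||
| | // scalar into lane 0, run the vector convert (VCVTTPD2QQ/VCVTTPS2QQ with | |||
| | // AVX512DQ), and extract lane 0 of the i64 result. Without VLX only the | |||
| | // 512-bit form is available, hence the widening to 8 elements. | |||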
| 34693 | MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts); | |||
| 34694 | MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts); | |||
| 34695 | unsigned Opc = N->getOpcode(); | |||
| 34696 | if (NumElts != SrcElts) { | |||
| 34697 | if (IsStrict) | |||
| 34698 | Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI; | |||
| 34699 | else | |||
| 34700 | Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; | |||
| 34701 | } | |||
| 34702 | ||||
| 34703 | SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); | |||
| 34704 | SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT, | |||
| 34705 | DAG.getConstantFP(0.0, dl, VecInVT), Src, | |||
| 34706 | ZeroIdx); | |||
| 34707 | SDValue Chain; | |||
| 34708 | if (IsStrict) { | |||
| 34709 | SDVTList Tys = DAG.getVTList(VecVT, MVT::Other); | |||
| 34710 | Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res); | |||
| 34711 | Chain = Res.getValue(1); | |||
| 34712 | } else | |||
| 34713 | Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res); | |||
| 34714 | Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx); | |||
| 34715 | Results.push_back(Res); | |||
| 34716 | if (IsStrict) | |||
| 34717 | Results.push_back(Chain); | |||
| 34718 | return; | |||
| 34719 | } | |||
| 34720 | ||||
| 34721 | if (VT == MVT::i128 && Subtarget.isTargetWin64()) { | |||
| 34722 | SDValue Chain; | |||
| 34723 | SDValue V = LowerWin64_FP_TO_INT128(SDValue(N, 0), DAG, Chain); | |||
| 34724 | Results.push_back(V); | |||
| 34725 | if (IsStrict) | |||
| 34726 | Results.push_back(Chain); | |||
| 34727 | return; | |||
| 34728 | } | |||
| 34729 | ||||
| 34730 | if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) { | |||
| 34731 | Results.push_back(V); | |||
| 34732 | if (IsStrict) | |||
| 34733 | Results.push_back(Chain); | |||
| 34734 | } | |||
| 34735 | return; | |||
| 34736 | } | |||
| 34737 | case ISD::LRINT: | |||
| 34738 | case ISD::LLRINT: { | |||
| 34739 | if (SDValue V = LRINT_LLRINTHelper(N, DAG)) | |||
| 34740 | Results.push_back(V); | |||
| 34741 | return; | |||
| 34742 | } | |||
| 34743 | ||||
| 34744 | case ISD::SINT_TO_FP: | |||
| 34745 | case ISD::STRICT_SINT_TO_FP: | |||
| 34746 | case ISD::UINT_TO_FP: | |||
| 34747 | case ISD::STRICT_UINT_TO_FP: { | |||
| 34748 | bool IsStrict = N->isStrictFPOpcode(); | |||
| 34749 | bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP || | |||
| 34750 | N->getOpcode() == ISD::STRICT_SINT_TO_FP; | |||
| 34751 | EVT VT = N->getValueType(0); | |||
| 34752 | SDValue Src = N->getOperand(IsStrict ? 1 : 0); | |||
| 34753 | if (VT.getVectorElementType() == MVT::f16 && Subtarget.hasFP16() && | |||
| 34754 | Subtarget.hasVLX()) { | |||
| 34755 | if (Src.getValueType().getVectorElementType() == MVT::i16) | |||
| 34756 | return; | |||
| 34757 | ||||
| 34758 | if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2i32) | |||
| 34759 | Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, | |||
| 34760 | IsStrict ? DAG.getConstant(0, dl, MVT::v2i32) | |||
| 34761 | : DAG.getUNDEF(MVT::v2i32)); | |||
| 34762 | if (IsStrict) { | |||
| 34763 | unsigned Opc = | |||
| 34764 | IsSigned ? X86ISD::STRICT_CVTSI2P : X86ISD::STRICT_CVTUI2P; | |||
| 34765 | SDValue Res = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other}, | |||
| 34766 | {N->getOperand(0), Src}); | |||
| 34767 | Results.push_back(Res); | |||
| 34768 | Results.push_back(Res.getValue(1)); | |||
| 34769 | } else { | |||
| 34770 | unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P; | |||
| 34771 | Results.push_back(DAG.getNode(Opc, dl, MVT::v8f16, Src)); | |||
| 34772 | } | |||
| 34773 | return; | |||
| 34774 | } | |||
| 34775 | if (VT != MVT::v2f32) | |||
| 34776 | return; | |||
| 34777 | EVT SrcVT = Src.getValueType(); | |||
| 34778 | if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) { | |||
| 34779 | if (IsStrict) { | |||
| 34780 | unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P | |||
| 34781 | : X86ISD::STRICT_CVTUI2P; | |||
| 34782 | SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other}, | |||
| 34783 | {N->getOperand(0), Src}); | |||
| 34784 | Results.push_back(Res); | |||
| 34785 | Results.push_back(Res.getValue(1)); | |||
| 34786 | } else { | |||
| 34787 | unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P; | |||
| 34788 | Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src)); | |||
| 34789 | } | |||
| 34790 | return; | |||
| 34791 | } | |||
| 34792 | if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() && | |||
| 34793 | Subtarget.hasSSE41() && !Subtarget.hasAVX512()) { | |||
| 34794 | SDValue Zero = DAG.getConstant(0, dl, SrcVT); | |||
| 34795 | SDValue One = DAG.getConstant(1, dl, SrcVT); | |||
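| | // Note: inputs with the sign bit set do not fit a signed i64 convert, so | |||
| | // they are halved first. (x >> 1) | (x & 1) keeps the shifted-out bit as | |||
| | // a sticky bit ("round to odd"), which preserves correct rounding when | |||
| | // the converted value is doubled again by the FADD further down. | |||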
| 34796 | SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT, | |||
| 34797 | DAG.getNode(ISD::SRL, dl, SrcVT, Src, One), | |||
| 34798 | DAG.getNode(ISD::AND, dl, SrcVT, Src, One)); | |||
| 34799 | SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT); | |||
| 34800 | SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src); | |||
| 34801 | SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32)); | |||
| 34802 | for (int i = 0; i != 2; ++i) { | |||
| 34803 | SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, | |||
| 34804 | SignSrc, DAG.getIntPtrConstant(i, dl)); | |||
| 34805 | if (IsStrict) | |||
| 34806 | SignCvts[i] = | |||
| 34807 | DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other}, | |||
| 34808 | {N->getOperand(0), Elt}); | |||
| 34809 | else | |||
| 34810 | SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt); | |||
| 34811 | } | |||
| 34812 | SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts); | |||
| 34813 | SDValue Slow, Chain; | |||
| 34814 | if (IsStrict) { | |||
| 34815 | Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, | |||
| 34816 | SignCvts[0].getValue(1), SignCvts[1].getValue(1)); | |||
| 34817 | Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other}, | |||
| 34818 | {Chain, SignCvt, SignCvt}); | |||
| 34819 | Chain = Slow.getValue(1); | |||
| 34820 | } else { | |||
| 34821 | Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt); | |||
| 34822 | } | |||
| 34823 | IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg); | |||
| 34824 | IsNeg = | |||
| 34825 | DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1}); | |||
| 34826 | SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt); | |||
| 34827 | Results.push_back(Cvt); | |||
| 34828 | if (IsStrict) | |||
| 34829 | Results.push_back(Chain); | |||
| 34830 | return; | |||
| 34831 | } | |||
| 34832 | ||||
| 34833 | if (SrcVT != MVT::v2i32) | |||
| 34834 | return; | |||
| 34835 | ||||
| 34836 | if (IsSigned || Subtarget.hasAVX512()) { | |||
| 34837 | if (!IsStrict) | |||
| 34838 | return; | |||
| 34839 | ||||
| 34840 | // Custom widen strict v2i32->v2f32 to avoid scalarization. | |||
| 34841 | // FIXME: Should generic type legalizer do this? | |||
| 34842 | Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src, | |||
| 34843 | DAG.getConstant(0, dl, MVT::v2i32)); | |||
| 34844 | SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other}, | |||
| 34845 | {N->getOperand(0), Src}); | |||
| 34846 | Results.push_back(Res); | |||
| 34847 | Results.push_back(Res.getValue(1)); | |||
| 34848 | return; | |||
| 34849 | } | |||
| 34850 | ||||
| 34851 | assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); | |||
| 34852 | SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src); | |||
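| | // Note: 0x4330000000000000 is the bit pattern of the double 2^52. For | |||
| | // any 32-bit x, OR-ing x into the low mantissa bits yields exactly | |||
| | // 2^52 + x (the ulp at this exponent is 1.0), so the FSUB below recovers | |||
| | // x as a double with no int->fp instruction. Worked example: x = 7 -> | |||
| | // bits 0x4330000000000007 = 2^52 + 7; subtracting 2^52 gives 7.0 exactly. | |||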
| 34853 | SDValue VBias = DAG.getConstantFP( | |||
| 34854 | llvm::bit_cast<double>(0x4330000000000000ULL), dl, MVT::v2f64); | |||
| 34855 | SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, | |||
| 34856 | DAG.getBitcast(MVT::v2i64, VBias)); | |||
| 34857 | Or = DAG.getBitcast(MVT::v2f64, Or); | |||
| 34858 | if (IsStrict) { | |||
| 34859 | SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other}, | |||
| 34860 | {N->getOperand(0), Or, VBias}); | |||
| 34861 | SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, | |||
| 34862 | {MVT::v4f32, MVT::Other}, | |||
| 34863 | {Sub.getValue(1), Sub}); | |||
| 34864 | Results.push_back(Res); | |||
| 34865 | Results.push_back(Res.getValue(1)); | |||
| 34866 | } else { | |||
| 34867 | // TODO: Are there any fast-math-flags to propagate here? | |||
| 34868 | SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); | |||
| 34869 | Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); | |||
| 34870 | } | |||
| 34871 | return; | |||
| 34872 | } | |||
| 34873 | case ISD::STRICT_FP_ROUND: | |||
| 34874 | case ISD::FP_ROUND: { | |||
| 34875 | bool IsStrict = N->isStrictFPOpcode(); | |||
| 34876 | SDValue Chain = IsStrict ? N->getOperand(0) : SDValue(); | |||
| 34877 | SDValue Src = N->getOperand(IsStrict ? 1 : 0); | |||
| 34878 | SDValue Rnd = N->getOperand(IsStrict ? 2 : 1); | |||
| 34879 | EVT SrcVT = Src.getValueType(); | |||
| 34880 | EVT VT = N->getValueType(0); | |||
| 34881 | SDValue V; | |||
| 34882 | if (VT == MVT::v2f16 && Src.getValueType() == MVT::v2f32) { | |||
| 34883 | SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f32) | |||
| 34884 | : DAG.getUNDEF(MVT::v2f32); | |||
| 34885 | Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, Ext); | |||
| 34886 | } | |||
| 34887 | if (!Subtarget.hasFP16() && VT.getVectorElementType() == MVT::f16) { | |||
| 34888 | assert(Subtarget.hasF16C() && "Cannot widen f16 without F16C"); | |||
| 34889 | if (SrcVT.getVectorElementType() != MVT::f32) | |||
| 34890 | return; | |||
| 34891 | ||||
| 34892 | if (IsStrict) | |||
| 34893 | V = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other}, | |||
| 34894 | {Chain, Src, Rnd}); | |||
| 34895 | else | |||
| 34896 | V = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Src, Rnd); | |||
| 34897 | ||||
| 34898 | Results.push_back(DAG.getBitcast(MVT::v8f16, V)); | |||
| 34899 | if (IsStrict) | |||
| 34900 | Results.push_back(V.getValue(1)); | |||
| 34901 | return; | |||
| 34902 | } | |||
| 34903 | if (!isTypeLegal(Src.getValueType())) | |||
| 34904 | return; | |||
| 34905 | EVT NewVT = VT.getVectorElementType() == MVT::f16 ? MVT::v8f16 : MVT::v4f32; | |||
| 34906 | if (IsStrict) | |||
| 34907 | V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {NewVT, MVT::Other}, | |||
| 34908 | {Chain, Src}); | |||
| 34909 | else | |||
| 34910 | V = DAG.getNode(X86ISD::VFPROUND, dl, NewVT, Src); | |||
| 34911 | Results.push_back(V); | |||
| 34912 | if (IsStrict) | |||
| 34913 | Results.push_back(V.getValue(1)); | |||
| 34914 | return; | |||
| 34915 | } | |||
| 34916 | case ISD::FP_EXTEND: | |||
| 34917 | case ISD::STRICT_FP_EXTEND: { | |||
| 34918 | // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND. | |||
| 34919 | // No other ValueType for FP_EXTEND should reach this point. | |||
| 34920 | assert(N->getValueType(0) == MVT::v2f32 && | |||
| 34921 | "Do not know how to legalize this Node"); | |||
| 34922 | if (!Subtarget.hasFP16() || !Subtarget.hasVLX()) | |||
| 34923 | return; | |||
| 34924 | bool IsStrict = N->isStrictFPOpcode(); | |||
| 34925 | SDValue Src = N->getOperand(IsStrict ? 1 : 0); | |||
| 34926 | SDValue Ext = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v2f16) | |||
| 34927 | : DAG.getUNDEF(MVT::v2f16); | |||
| 34928 | SDValue V = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f16, Src, Ext); | |||
| 34929 | if (IsStrict) | |||
| 34930 | V = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {MVT::v4f32, MVT::Other}, | |||
| 34931 | {N->getOperand(0), V}); | |||
| 34932 | else | |||
| 34933 | V = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v4f32, V); | |||
| 34934 | Results.push_back(V); | |||
| 34935 | if (IsStrict) | |||
| 34936 | Results.push_back(V.getValue(1)); | |||
| 34937 | return; | |||
| 34938 | } | |||
| 34939 | case ISD::INTRINSIC_W_CHAIN: { | |||
| 34940 | unsigned IntNo = N->getConstantOperandVal(1); | |||
| 34941 | switch (IntNo) { | |||
| 34942 | default : llvm_unreachable("Do not know how to custom type " | |||
| 34943 | "legalize this intrinsic operation!"); | |||
| 34944 | case Intrinsic::x86_rdtsc: | |||
| 34945 | return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, | |||
| 34946 | Results); | |||
| 34947 | case Intrinsic::x86_rdtscp: | |||
| 34948 | return getReadTimeStampCounter(N, dl, X86::RDTSCP, DAG, Subtarget, | |||
| 34949 | Results); | |||
| 34950 | case Intrinsic::x86_rdpmc: | |||
| 34951 | expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPMC, X86::ECX, Subtarget, | |||
| 34952 | Results); | |||
| 34953 | return; | |||
| 34954 | case Intrinsic::x86_rdpru: | |||
| 34955 | expandIntrinsicWChainHelper(N, dl, DAG, X86::RDPRU, X86::ECX, Subtarget, | |||
| 34956 | Results); | |||
| 34957 | return; | |||
| 34958 | case Intrinsic::x86_xgetbv: | |||
| 34959 | expandIntrinsicWChainHelper(N, dl, DAG, X86::XGETBV, X86::ECX, Subtarget, | |||
| 34960 | Results); | |||
| 34961 | return; | |||
| 34962 | } | |||
| 34963 | } | |||
| 34964 | case ISD::READCYCLECOUNTER: { | |||
| 34965 | return getReadTimeStampCounter(N, dl, X86::RDTSC, DAG, Subtarget, Results); | |||
| 34966 | } | |||
| 34967 | case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: { | |||
| 34968 | EVT T = N->getValueType(0); | |||
| 34969 | assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair"); | |||
| 34970 | bool Regs64bit = T == MVT::i128; | |||
| 34971 | assert((!Regs64bit || Subtarget.canUseCMPXCHG16B()) && | |||
| 34972 | "64-bit ATOMIC_CMP_SWAP_WITH_SUCCESS requires CMPXCHG16B"); | |||
| 34973 | MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32; | |||
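| | // CMPXCHG8B/CMPXCHG16B have a fixed register contract: the expected | |||
| | // value goes in EDX:EAX (RDX:RAX), the replacement in ECX:EBX (RCX:RBX), | |||
| | // the old memory value comes back in EDX:EAX (RDX:RAX), and ZF reports | |||
| | // success. The CopyToReg/CopyFromReg chain below pins the operands to | |||
| | // exactly those registers. | |||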
| 34974 | SDValue cpInL, cpInH; | |||
| 34975 | std::tie(cpInL, cpInH) = | |||
| 34976 | DAG.SplitScalar(N->getOperand(2), dl, HalfT, HalfT); | |||
| 34977 | cpInL = DAG.getCopyToReg(N->getOperand(0), dl, | |||
| 34978 | Regs64bit ? X86::RAX : X86::EAX, cpInL, SDValue()); | |||
| 34979 | cpInH = | |||
| 34980 | DAG.getCopyToReg(cpInL.getValue(0), dl, Regs64bit ? X86::RDX : X86::EDX, | |||
| 34981 | cpInH, cpInL.getValue(1)); | |||
| 34982 | SDValue swapInL, swapInH; | |||
| 34983 | std::tie(swapInL, swapInH) = | |||
| 34984 | DAG.SplitScalar(N->getOperand(3), dl, HalfT, HalfT); | |||
| 34985 | swapInH = | |||
| 34986 | DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX, | |||
| 34987 | swapInH, cpInH.getValue(1)); | |||
| 34988 | ||||
| 34989 | // In 64-bit mode we might need the base pointer in RBX, but we can't know | |||
| 34990 | // until later, so we keep the RBX input in a vreg and use a custom | |||
| 34991 | // inserter. Since RBX is a reserved register, the register allocator will | |||
| 34992 | // not save and restore its value around this live range on its own; the | |||
| 34993 | // custom inserter has to take care of that. | |||
| 34995 | SDValue Result; | |||
| 34996 | SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); | |||
| 34997 | MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand(); | |||
| 34998 | if (Regs64bit) { | |||
| 34999 | SDValue Ops[] = {swapInH.getValue(0), N->getOperand(1), swapInL, | |||
| 35000 | swapInH.getValue(1)}; | |||
| 35001 | Result = | |||
| 35002 | DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG16_DAG, dl, Tys, Ops, T, MMO); | |||
| 35003 | } else { | |||
| 35004 | swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl, X86::EBX, swapInL, | |||
| 35005 | swapInH.getValue(1)); | |||
| 35006 | SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1), | |||
| 35007 | swapInL.getValue(1)}; | |||
| 35008 | Result = | |||
| 35009 | DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG8_DAG, dl, Tys, Ops, T, MMO); | |||
| 35010 | } | |||
| 35011 | ||||
| 35012 | SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl, | |||
| 35013 | Regs64bit ? X86::RAX : X86::EAX, | |||
| 35014 | HalfT, Result.getValue(1)); | |||
| 35015 | SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl, | |||
| 35016 | Regs64bit ? X86::RDX : X86::EDX, | |||
| 35017 | HalfT, cpOutL.getValue(2)); | |||
| 35018 | SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)}; | |||
| 35019 | ||||
| 35020 | SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS, | |||
| 35021 | MVT::i32, cpOutH.getValue(2)); | |||
| 35022 | SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG); | |||
| 35023 | Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1)); | |||
| 35024 | ||||
| 35025 | Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF)); | |||
| 35026 | Results.push_back(Success); | |||
| 35027 | Results.push_back(EFLAGS.getValue(1)); | |||
| 35028 | return; | |||
| 35029 | } | |||
| 35030 | case ISD::ATOMIC_LOAD: { | |||
| 35031 | assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); | |||
| 35032 | bool NoImplicitFloatOps = | |||
| 35033 | DAG.getMachineFunction().getFunction().hasFnAttribute( | |||
| 35034 | Attribute::NoImplicitFloat); | |||
| 35035 | if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) { | |||
| 35036 | auto *Node = cast<AtomicSDNode>(N); | |||
| 35037 | if (Subtarget.hasSSE1()) { | |||
| 35038 | // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS. | |||
| 35039 | // Then extract the lower 64-bits. | |||
| 35040 | MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32; | |||
| 35041 | SDVTList Tys = DAG.getVTList(LdVT, MVT::Other); | |||
| 35042 | SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; | |||
| 35043 | SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, | |||
| 35044 | MVT::i64, Node->getMemOperand()); | |||
| 35045 | if (Subtarget.hasSSE2()) { | |||
| 35046 | SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld, | |||
| 35047 | DAG.getIntPtrConstant(0, dl)); | |||
| 35048 | Results.push_back(Res); | |||
| 35049 | Results.push_back(Ld.getValue(1)); | |||
| 35050 | return; | |||
| 35051 | } | |||
| 35052 | // We use an alternative sequence for SSE1 that extracts as v2f32 and | |||
| 35053 | // then casts to i64. This avoids a 128-bit stack temporary being | |||
| 35054 | // created by type legalization if we were to cast v4f32->v2i64. | |||
| 35055 | SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld, | |||
| 35056 | DAG.getIntPtrConstant(0, dl)); | |||
| 35057 | Res = DAG.getBitcast(MVT::i64, Res); | |||
| 35058 | Results.push_back(Res); | |||
| 35059 | Results.push_back(Ld.getValue(1)); | |||
| 35060 | return; | |||
| 35061 | } | |||
| 35062 | if (Subtarget.hasX87()) { | |||
| 35063 | // First load this into an 80-bit X87 register. This will put the whole | |||
| 35064 | // integer into the significand. | |||
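| | // (The f80 significand is a full 64 bits, so every i64 value round-trips | |||
| | // exactly; f64, with only 53 significand bits, would be lossy.) | |||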
| 35065 | SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other); | |||
| 35066 | SDValue Ops[] = { Node->getChain(), Node->getBasePtr() }; | |||
| 35067 | SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD, | |||
| 35068 | dl, Tys, Ops, MVT::i64, | |||
| 35069 | Node->getMemOperand()); | |||
| 35070 | SDValue Chain = Result.getValue(1); | |||
| 35071 | ||||
| 35072 | // Now store the X87 register to a stack temporary and convert to i64. | |||
| 35073 | // This store is not atomic and doesn't need to be. | |||
| 35074 | // FIXME: We don't need a stack temporary if the result of the load | |||
| 35075 | // is already being stored. We could just directly store there. | |||
| 35076 | SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64); | |||
| 35077 | int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex(); | |||
| 35078 | MachinePointerInfo MPI = | |||
| 35079 | MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI); | |||
| 35080 | SDValue StoreOps[] = { Chain, Result, StackPtr }; | |||
| 35081 | Chain = DAG.getMemIntrinsicNode( | |||
| 35082 | X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64, | |||
| 35083 | MPI, std::nullopt /*Align*/, MachineMemOperand::MOStore); | |||
| 35084 | ||||
| 35085 | // Finally load the value back from the stack temporary and return it. | |||
| 35086 | // This load is not atomic and doesn't need to be. | |||
| 35087 | // This load will be further type legalized. | |||
| 35088 | Result = DAG.getLoad(MVT::i64, dl, Chain, StackPtr, MPI); | |||
| 35089 | Results.push_back(Result); | |||
| 35090 | Results.push_back(Result.getValue(1)); | |||
| 35091 | return; | |||
| 35092 | } | |||
| 35093 | } | |||
| 35094 | // TODO: Use MOVLPS when SSE1 is available? | |||
| 35095 | // Delegate to generic TypeLegalization. Anything we could actually handle | |||
| 35096 | // here should already have been dealt with by AtomicExpandPass.cpp. | |||
| 35097 | break; | |||
| 35098 | } | |||
| 35099 | case ISD::ATOMIC_SWAP: | |||
| 35100 | case ISD::ATOMIC_LOAD_ADD: | |||
| 35101 | case ISD::ATOMIC_LOAD_SUB: | |||
| 35102 | case ISD::ATOMIC_LOAD_AND: | |||
| 35103 | case ISD::ATOMIC_LOAD_OR: | |||
| 35104 | case ISD::ATOMIC_LOAD_XOR: | |||
| 35105 | case ISD::ATOMIC_LOAD_NAND: | |||
| 35106 | case ISD::ATOMIC_LOAD_MIN: | |||
| 35107 | case ISD::ATOMIC_LOAD_MAX: | |||
| 35108 | case ISD::ATOMIC_LOAD_UMIN: | |||
| 35109 | case ISD::ATOMIC_LOAD_UMAX: | |||
| 35110 | // Delegate to generic TypeLegalization. Anything we could actually handle | |||
| 35111 | // here should already have been dealt with by AtomicExpandPass.cpp. | |||
| 35112 | break; | |||
| 35113 | ||||
| 35114 | case ISD::BITCAST: { | |||
| 35115 | assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); | |||
| 35116 | EVT DstVT = N->getValueType(0); | |||
| 35117 | EVT SrcVT = N->getOperand(0).getValueType(); | |||
| 35118 | ||||
| 35119 | // If this is a bitcast from a v64i1 k-register to a i64 on a 32-bit target | |||
| 35120 | // we can split using the k-register rather than memory. | |||
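| | // That is: split the v64i1 into two v32i1 halves, move each half out of | |||
| | // the mask register as an i32 (KMOVD), and rebuild the i64 with | |||
| | // BUILD_PAIR, avoiding a spill of the k-register through the stack. | |||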
| 35121 | if (SrcVT == MVT::v64i1 && DstVT == MVT::i64 && Subtarget.hasBWI()) { | |||
| 35122 | assert(!Subtarget.is64Bit() && "Expected 32-bit mode"); | |||
| 35123 | SDValue Lo, Hi; | |||
| 35124 | std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0); | |||
| 35125 | Lo = DAG.getBitcast(MVT::i32, Lo); | |||
| 35126 | Hi = DAG.getBitcast(MVT::i32, Hi); | |||
| 35127 | SDValue Res = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi); | |||
| 35128 | Results.push_back(Res); | |||
| 35129 | return; | |||
| 35130 | } | |||
| 35131 | ||||
| 35132 | if (DstVT.isVector() && SrcVT == MVT::x86mmx) { | |||
| 35133 | // FIXME: Use v4f32 for SSE1? | |||
| 35134 | assert(Subtarget.hasSSE2() && "Requires SSE2"); | |||
| 35135 | assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector && | |||
| 35136 | "Unexpected type action!"); | |||
| 35137 | EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT); | |||
| 35138 | SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64, | |||
| 35139 | N->getOperand(0)); | |||
| 35140 | Res = DAG.getBitcast(WideVT, Res); | |||
| 35141 | Results.push_back(Res); | |||
| 35142 | return; | |||
| 35143 | } | |||
| 35144 | ||||
| 35145 | return; | |||
| 35146 | } | |||
| 35147 | case ISD::MGATHER: { | |||
| 35148 | EVT VT = N->getValueType(0); | |||
| 35149 | if ((VT == MVT::v2f32 || VT == MVT::v2i32) && | |||
| 35150 | (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { | |||
| 35151 | auto *Gather = cast<MaskedGatherSDNode>(N); | |||
| 35152 | SDValue Index = Gather->getIndex(); | |||
| 35153 | if (Index.getValueType() != MVT::v2i64) | |||
| 35154 | return; | |||
| 35155 | assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && | |||
| 35156 | "Unexpected type action!"); | |||
| 35157 | EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT); | |||
| 35158 | SDValue Mask = Gather->getMask(); | |||
| 35159 | assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); | |||
| 35160 | SDValue PassThru = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, | |||
| 35161 | Gather->getPassThru(), | |||
| 35162 | DAG.getUNDEF(VT)); | |||
| 35163 | if (!Subtarget.hasVLX()) { | |||
| 35164 | // We need to widen the mask, but the instruction will only use 2 | |||
| 35165 | // of its elements. So we can use undef. | |||
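| | // Without AVX512 this is the AVX2 form of the gather, whose mask lives | |||
| | // in an ordinary vector register and is tested by sign bit only, so the | |||
| | // sign-extension below produces the required all-ones/all-zeros lanes. | |||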
| 35166 | Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, | |||
| 35167 | DAG.getUNDEF(MVT::v2i1)); | |||
| 35168 | Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); | |||
| 35169 | } | |||
| 35170 | SDValue Ops[] = { Gather->getChain(), PassThru, Mask, | |||
| 35171 | Gather->getBasePtr(), Index, Gather->getScale() }; | |||
| 35172 | SDValue Res = DAG.getMemIntrinsicNode( | |||
| 35173 | X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops, | |||
| 35174 | Gather->getMemoryVT(), Gather->getMemOperand()); | |||
| 35175 | Results.push_back(Res); | |||
| 35176 | Results.push_back(Res.getValue(1)); | |||
| 35177 | return; | |||
| 35178 | } | |||
| 35179 | return; | |||
| 35180 | } | |||
| 35181 | case ISD::LOAD: { | |||
| 35182 | // Use an f64/i64 load and a scalar_to_vector for v2f32/v2i32 loads. This | |||
| 35183 | // avoids scalarizing in 32-bit mode. In 64-bit mode this avoids an int->fp | |||
| 35184 | // cast since type legalization will try to use an i64 load. | |||
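| | // e.g. a v2f32 load becomes one scalar f64 load, a SCALAR_TO_VECTOR to | |||
| | // v2f64, and a bitcast to the widened v4f32 type, so all 64 bits move in | |||
| | // a single instruction. | |||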
| 35185 | MVT VT = N->getSimpleValueType(0); | |||
| 35186 | assert(VT.isVector() && VT.getSizeInBits() == 64 && "Unexpected VT"); | |||
| 35187 | assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector && | |||
| 35188 | "Unexpected type action!"); | |||
| 35189 | if (!ISD::isNON_EXTLoad(N)) | |||
| 35190 | return; | |||
| 35191 | auto *Ld = cast<LoadSDNode>(N); | |||
| 35192 | if (Subtarget.hasSSE2()) { | |||
| 35193 | MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64; | |||
| 35194 | SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(), | |||
| 35195 | Ld->getPointerInfo(), Ld->getOriginalAlign(), | |||
| 35196 | Ld->getMemOperand()->getFlags()); | |||
| 35197 | SDValue Chain = Res.getValue(1); | |||
| 35198 | MVT VecVT = MVT::getVectorVT(LdVT, 2); | |||
| 35199 | Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Res); | |||
| 35200 | EVT WideVT = getTypeToTransformTo(*DAG.getContext(), VT); | |||
| 35201 | Res = DAG.getBitcast(WideVT, Res); | |||
| 35202 | Results.push_back(Res); | |||
| 35203 | Results.push_back(Chain); | |||
| 35204 | return; | |||
| 35205 | } | |||
| 35206 | assert(Subtarget.hasSSE1() && "Expected SSE"); | |||
| 35207 | SDVTList Tys = DAG.getVTList(MVT::v4f32, MVT::Other); | |||
| 35208 | SDValue Ops[] = {Ld->getChain(), Ld->getBasePtr()}; | |||
| 35209 | SDValue Res = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, | |||
| 35210 | MVT::i64, Ld->getMemOperand()); | |||
| 35211 | Results.push_back(Res); | |||
| 35212 | Results.push_back(Res.getValue(1)); | |||
| 35213 | return; | |||
| 35214 | } | |||
| 35215 | case ISD::ADDRSPACECAST: { | |||
| 35216 | SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG); | |||
| 35217 | Results.push_back(V); | |||
| 35218 | return; | |||
| 35219 | } | |||
| 35220 | case ISD::BITREVERSE: { | |||
| 35221 | assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!"); | |||
| 35222 | assert(Subtarget.hasXOP() && "Expected XOP"); | |||
| 35223 | // We can use VPPERM by copying to a vector register and back. We'll need | |||
| 35224 | // to move the scalar in two i32 pieces. | |||
| 35225 | Results.push_back(LowerBITREVERSE(SDValue(N, 0), Subtarget, DAG)); | |||
| 35226 | return; | |||
| 35227 | } | |||
| 35228 | case ISD::EXTRACT_VECTOR_ELT: { | |||
| 35229 | // f16 = extract vXf16 %vec, i64 %idx | |||
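| | // Lowered by reinterpreting the vector as vXi16, extracting the lane as | |||
| | // an i16, and bitcasting the scalar back to f16; only the type changes, | |||
| | // the bits do not. | |||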
| 35230 | assert(N->getSimpleValueType(0) == MVT::f16 && | |||
| 35231 | "Unexpected Value type of EXTRACT_VECTOR_ELT!"); | |||
| 35232 | assert(Subtarget.hasFP16() && "Expected FP16"); | |||
| 35233 | SDValue VecOp = N->getOperand(0); | |||
| 35234 | EVT ExtVT = VecOp.getValueType().changeVectorElementTypeToInteger(); | |||
| 35235 | SDValue Split = DAG.getBitcast(ExtVT, N->getOperand(0)); | |||
| 35236 | Split = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Split, | |||
| 35237 | N->getOperand(1)); | |||
| 35238 | Split = DAG.getBitcast(MVT::f16, Split); | |||
| 35239 | Results.push_back(Split); | |||
| 35240 | return; | |||
| 35241 | } | |||
| 35242 | } | |||
| 35243 | } | |||
| 35244 | ||||
| 35245 | const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { | |||
| 35246 | switch ((X86ISD::NodeType)Opcode) { | |||
| 35247 | case X86ISD::FIRST_NUMBER: break; | |||
| 35248 | #define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE; | |||
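| | // For example, NODE_NAME_CASE(BSF) expands to: | |||
| | //   case X86ISD::BSF: return "X86ISD::BSF"; | |||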
| 35249 | NODE_NAME_CASE(BSF) | |||
| 35250 | NODE_NAME_CASE(BSR) | |||
| 35251 | NODE_NAME_CASE(FSHL) | |||
| 35252 | NODE_NAME_CASE(FSHR) | |||
| 35253 | NODE_NAME_CASE(FAND) | |||
| 35254 | NODE_NAME_CASE(FANDN) | |||
| 35255 | NODE_NAME_CASE(FOR) | |||
| 35256 | NODE_NAME_CASE(FXOR) | |||
| 35257 | NODE_NAME_CASE(FILD) | |||
| 35258 | NODE_NAME_CASE(FIST) | |||
| 35259 | NODE_NAME_CASE(FP_TO_INT_IN_MEM) | |||
| 35260 | NODE_NAME_CASE(FLD) | |||
| 35261 | NODE_NAME_CASE(FST) | |||
| 35262 | NODE_NAME_CASE(CALL) | |||
| 35263 | NODE_NAME_CASE(CALL_RVMARKER) | |||
| 35264 | NODE_NAME_CASE(BT) | |||
| 35265 | NODE_NAME_CASE(CMP) | |||
| 35266 | NODE_NAME_CASE(FCMP) | |||
| 35267 | NODE_NAME_CASE(STRICT_FCMP) | |||
| 35268 | NODE_NAME_CASE(STRICT_FCMPS) | |||
| 35269 | NODE_NAME_CASE(COMI) | |||
| 35270 | NODE_NAME_CASE(UCOMI) | |||
| 35271 | NODE_NAME_CASE(CMPM) | |||
| 35272 | NODE_NAME_CASE(CMPMM) | |||
| 35273 | NODE_NAME_CASE(STRICT_CMPM) | |||
| 35274 | NODE_NAME_CASE(CMPMM_SAE) | |||
| 35275 | NODE_NAME_CASE(SETCC) | |||
| 35276 | NODE_NAME_CASE(SETCC_CARRY) | |||
| 35277 | NODE_NAME_CASE(FSETCC) | |||
| 35278 | NODE_NAME_CASE(FSETCCM) | |||
| 35279 | NODE_NAME_CASE(FSETCCM_SAE) | |||
| 35280 | NODE_NAME_CASE(CMOV) | |||
| 35281 | NODE_NAME_CASE(BRCOND) | |||
| 35282 | NODE_NAME_CASE(RET_GLUE) | |||
| 35283 | NODE_NAME_CASE(IRET) | |||
| 35284 | NODE_NAME_CASE(REP_STOS) | |||
| 35285 | NODE_NAME_CASE(REP_MOVS) | |||
| 35286 | NODE_NAME_CASE(GlobalBaseReg) | |||
| 35287 | NODE_NAME_CASE(Wrapper) | |||
| 35288 | NODE_NAME_CASE(WrapperRIP) | |||
| 35289 | NODE_NAME_CASE(MOVQ2DQ) | |||
| 35290 | NODE_NAME_CASE(MOVDQ2Q) | |||
| 35291 | NODE_NAME_CASE(MMX_MOVD2W) | |||
| 35292 | NODE_NAME_CASE(MMX_MOVW2D) | |||
| 35293 | NODE_NAME_CASE(PEXTRB) | |||
| 35294 | NODE_NAME_CASE(PEXTRW) | |||
| 35295 | NODE_NAME_CASE(INSERTPS) | |||
| 35296 | NODE_NAME_CASE(PINSRB) | |||
| 35297 | NODE_NAME_CASE(PINSRW) | |||
| 35298 | NODE_NAME_CASE(PSHUFB) | |||
| 35299 | NODE_NAME_CASE(ANDNP) | |||
| 35300 | NODE_NAME_CASE(BLENDI) | |||
| 35301 | NODE_NAME_CASE(BLENDV) | |||
| 35302 | NODE_NAME_CASE(HADD) | |||
| 35303 | NODE_NAME_CASE(HSUB) | |||
| 35304 | NODE_NAME_CASE(FHADD) | |||
| 35305 | NODE_NAME_CASE(FHSUB) | |||
| 35306 | NODE_NAME_CASE(CONFLICT) | |||
| 35307 | NODE_NAME_CASE(FMAX) | |||
| 35308 | NODE_NAME_CASE(FMAXS) | |||
| 35309 | NODE_NAME_CASE(FMAX_SAE) | |||
| 35310 | NODE_NAME_CASE(FMAXS_SAE) | |||
| 35311 | NODE_NAME_CASE(FMIN) | |||
| 35312 | NODE_NAME_CASE(FMINS) | |||
| 35313 | NODE_NAME_CASE(FMIN_SAE) | |||
| 35314 | NODE_NAME_CASE(FMINS_SAE) | |||
| 35315 | NODE_NAME_CASE(FMAXC) | |||
| 35316 | NODE_NAME_CASE(FMINC) | |||
| 35317 | NODE_NAME_CASE(FRSQRT) | |||
| 35318 | NODE_NAME_CASE(FRCP) | |||
| 35319 | NODE_NAME_CASE(EXTRQI) | |||
| 35320 | NODE_NAME_CASE(INSERTQI) | |||
| 35321 | NODE_NAME_CASE(TLSADDR) | |||
| 35322 | NODE_NAME_CASE(TLSBASEADDR) | |||
| 35323 | NODE_NAME_CASE(TLSCALL) | |||
| 35324 | NODE_NAME_CASE(EH_SJLJ_SETJMP) | |||
| 35325 | NODE_NAME_CASE(EH_SJLJ_LONGJMP) | |||
| 35326 | NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH) | |||
| 35327 | NODE_NAME_CASE(EH_RETURN) | |||
| 35328 | NODE_NAME_CASE(TC_RETURN) | |||
| 35329 | NODE_NAME_CASE(FNSTCW16m) | |||
| 35330 | NODE_NAME_CASE(FLDCW16m) | |||
| 35331 | NODE_NAME_CASE(LCMPXCHG_DAG) | |||
| 35332 | NODE_NAME_CASE(LCMPXCHG8_DAG) | |||
| 35333 | NODE_NAME_CASE(LCMPXCHG16_DAG) | |||
| 35334 | NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG) | |||
| 35335 | NODE_NAME_CASE(LADD) | |||
| 35336 | NODE_NAME_CASE(LSUB) | |||
| 35337 | NODE_NAME_CASE(LOR) | |||
| 35338 | NODE_NAME_CASE(LXOR) | |||
| 35339 | NODE_NAME_CASE(LAND) | |||
| 35340 | NODE_NAME_CASE(LBTS) | |||
| 35341 | NODE_NAME_CASE(LBTC) | |||
| 35342 | NODE_NAME_CASE(LBTR) | |||
| 35343 | NODE_NAME_CASE(LBTS_RM) | |||
| 35344 | NODE_NAME_CASE(LBTC_RM) | |||
| 35345 | NODE_NAME_CASE(LBTR_RM) | |||
| 35346 | NODE_NAME_CASE(AADD) | |||
| 35347 | NODE_NAME_CASE(AOR) | |||
| 35348 | NODE_NAME_CASE(AXOR) | |||
| 35349 | NODE_NAME_CASE(AAND) | |||
| 35350 | NODE_NAME_CASE(VZEXT_MOVL) | |||
| 35351 | NODE_NAME_CASE(VZEXT_LOAD) | |||
| 35352 | NODE_NAME_CASE(VEXTRACT_STORE) | |||
| 35353 | NODE_NAME_CASE(VTRUNC) | |||
| 35354 | NODE_NAME_CASE(VTRUNCS) | |||
| 35355 | NODE_NAME_CASE(VTRUNCUS) | |||
| 35356 | NODE_NAME_CASE(VMTRUNC) | |||
| 35357 | NODE_NAME_CASE(VMTRUNCS) | |||
| 35358 | NODE_NAME_CASE(VMTRUNCUS) | |||
| 35359 | NODE_NAME_CASE(VTRUNCSTORES) | |||
| 35360 | NODE_NAME_CASE(VTRUNCSTOREUS) | |||
| 35361 | NODE_NAME_CASE(VMTRUNCSTORES) | |||
| 35362 | NODE_NAME_CASE(VMTRUNCSTOREUS) | |||
| 35363 | NODE_NAME_CASE(VFPEXT) | |||
| 35364 | NODE_NAME_CASE(STRICT_VFPEXT) | |||
| 35365 | NODE_NAME_CASE(VFPEXT_SAE) | |||
| 35366 | NODE_NAME_CASE(VFPEXTS) | |||
| 35367 | NODE_NAME_CASE(VFPEXTS_SAE) | |||
| 35368 | NODE_NAME_CASE(VFPROUND) | |||
| 35369 | NODE_NAME_CASE(STRICT_VFPROUND) | |||
| 35370 | NODE_NAME_CASE(VMFPROUND) | |||
| 35371 | NODE_NAME_CASE(VFPROUND_RND) | |||
| 35372 | NODE_NAME_CASE(VFPROUNDS) | |||
| 35373 | NODE_NAME_CASE(VFPROUNDS_RND) | |||
| 35374 | NODE_NAME_CASE(VSHLDQ) | |||
| 35375 | NODE_NAME_CASE(VSRLDQ) | |||
| 35376 | NODE_NAME_CASE(VSHL) | |||
| 35377 | NODE_NAME_CASE(VSRL) | |||
| 35378 | NODE_NAME_CASE(VSRA) | |||
| 35379 | NODE_NAME_CASE(VSHLI) | |||
| 35380 | NODE_NAME_CASE(VSRLI) | |||
| 35381 | NODE_NAME_CASE(VSRAI) | |||
| 35382 | NODE_NAME_CASE(VSHLV) | |||
| 35383 | NODE_NAME_CASE(VSRLV) | |||
| 35384 | NODE_NAME_CASE(VSRAV) | |||
| 35385 | NODE_NAME_CASE(VROTLI) | |||
| 35386 | NODE_NAME_CASE(VROTRI) | |||
| 35387 | NODE_NAME_CASE(VPPERM) | |||
| 35388 | NODE_NAME_CASE(CMPP) | |||
| 35389 | NODE_NAME_CASE(STRICT_CMPP) | |||
| 35390 | NODE_NAME_CASE(PCMPEQ) | |||
| 35391 | NODE_NAME_CASE(PCMPGT) | |||
| 35392 | NODE_NAME_CASE(PHMINPOS) | |||
| 35393 | NODE_NAME_CASE(ADD) | |||
| 35394 | NODE_NAME_CASE(SUB) | |||
| 35395 | NODE_NAME_CASE(ADC) | |||
| 35396 | NODE_NAME_CASE(SBB) | |||
| 35397 | NODE_NAME_CASE(SMUL) | |||
| 35398 | NODE_NAME_CASE(UMUL) | |||
| 35399 | NODE_NAME_CASE(OR) | |||
| 35400 | NODE_NAME_CASE(XOR) | |||
| 35401 | NODE_NAME_CASE(AND) | |||
| 35402 | NODE_NAME_CASE(BEXTR) | |||
| 35403 | NODE_NAME_CASE(BEXTRI) | |||
| 35404 | NODE_NAME_CASE(BZHI) | |||
| 35405 | NODE_NAME_CASE(PDEP) | |||
| 35406 | NODE_NAME_CASE(PEXT) | |||
| 35407 | NODE_NAME_CASE(MUL_IMM) | |||
| 35408 | NODE_NAME_CASE(MOVMSK) | |||
| 35409 | NODE_NAME_CASE(PTEST) | |||
| 35410 | NODE_NAME_CASE(TESTP) | |||
| 35411 | NODE_NAME_CASE(KORTEST) | |||
| 35412 | NODE_NAME_CASE(KTEST) | |||
| 35413 | NODE_NAME_CASE(KADD) | |||
| 35414 | NODE_NAME_CASE(KSHIFTL) | |||
| 35415 | NODE_NAME_CASE(KSHIFTR) | |||
| 35416 | NODE_NAME_CASE(PACKSS) | |||
| 35417 | NODE_NAME_CASE(PACKUS) | |||
| 35418 | NODE_NAME_CASE(PALIGNR) | |||
| 35419 | NODE_NAME_CASE(VALIGN) | |||
| 35420 | NODE_NAME_CASE(VSHLD) | |||
| 35421 | NODE_NAME_CASE(VSHRD) | |||
| 35422 | NODE_NAME_CASE(VSHLDV) | |||
| 35423 | NODE_NAME_CASE(VSHRDV) | |||
| 35424 | NODE_NAME_CASE(PSHUFD) | |||
| 35425 | NODE_NAME_CASE(PSHUFHW) | |||
| 35426 | NODE_NAME_CASE(PSHUFLW) | |||
| 35427 | NODE_NAME_CASE(SHUFP) | |||
| 35428 | NODE_NAME_CASE(SHUF128) | |||
| 35429 | NODE_NAME_CASE(MOVLHPS) | |||
| 35430 | NODE_NAME_CASE(MOVHLPS) | |||
| 35431 | NODE_NAME_CASE(MOVDDUP) | |||
| 35432 | NODE_NAME_CASE(MOVSHDUP) | |||
| 35433 | NODE_NAME_CASE(MOVSLDUP) | |||
| 35434 | NODE_NAME_CASE(MOVSD) | |||
| 35435 | NODE_NAME_CASE(MOVSS) | |||
| 35436 | NODE_NAME_CASE(MOVSH) | |||
| 35437 | NODE_NAME_CASE(UNPCKL) | |||
| 35438 | NODE_NAME_CASE(UNPCKH) | |||
| 35439 | NODE_NAME_CASE(VBROADCAST) | |||
| 35440 | NODE_NAME_CASE(VBROADCAST_LOAD) | |||
| 35441 | NODE_NAME_CASE(VBROADCASTM) | |||
| 35442 | NODE_NAME_CASE(SUBV_BROADCAST_LOAD) | |||
| 35443 | NODE_NAME_CASE(VPERMILPV) | |||
| 35444 | NODE_NAME_CASE(VPERMILPI) | |||
| 35445 | NODE_NAME_CASE(VPERM2X128) | |||
| 35446 | NODE_NAME_CASE(VPERMV) | |||
| 35447 | NODE_NAME_CASE(VPERMV3) | |||
| 35448 | NODE_NAME_CASE(VPERMI) | |||
| 35449 | NODE_NAME_CASE(VPTERNLOG) | |||
| 35450 | NODE_NAME_CASE(VFIXUPIMM) | |||
| 35451 | NODE_NAME_CASE(VFIXUPIMM_SAE) | |||
| 35452 | NODE_NAME_CASE(VFIXUPIMMS) | |||
| 35453 | NODE_NAME_CASE(VFIXUPIMMS_SAE) | |||
| 35454 | NODE_NAME_CASE(VRANGE) | |||
| 35455 | NODE_NAME_CASE(VRANGE_SAE) | |||
| 35456 | NODE_NAME_CASE(VRANGES) | |||
| 35457 | NODE_NAME_CASE(VRANGES_SAE) | |||
| 35458 | NODE_NAME_CASE(PMULUDQ) | |||
| 35459 | NODE_NAME_CASE(PMULDQ) | |||
| 35460 | NODE_NAME_CASE(PSADBW) | |||
| 35461 | NODE_NAME_CASE(DBPSADBW) | |||
| 35462 | NODE_NAME_CASE(VASTART_SAVE_XMM_REGS) | |||
| 35463 | NODE_NAME_CASE(VAARG_64) | |||
| 35464 | NODE_NAME_CASE(VAARG_X32) | |||
| 35465 | NODE_NAME_CASE(DYN_ALLOCA) | |||
| 35466 | NODE_NAME_CASE(MFENCE) | |||
| 35467 | NODE_NAME_CASE(SEG_ALLOCA) | |||
| 35468 | NODE_NAME_CASE(PROBED_ALLOCA) | |||
| 35469 | NODE_NAME_CASE(RDRAND) | |||
| 35470 | NODE_NAME_CASE(RDSEED) | |||
| 35471 | NODE_NAME_CASE(RDPKRU) | |||
| 35472 | NODE_NAME_CASE(WRPKRU) | |||
| 35473 | NODE_NAME_CASE(VPMADDUBSW) | |||
| 35474 | NODE_NAME_CASE(VPMADDWD) | |||
| 35475 | NODE_NAME_CASE(VPSHA) | |||
| 35476 | NODE_NAME_CASE(VPSHL) | |||
| 35477 | NODE_NAME_CASE(VPCOM) | |||
| 35478 | NODE_NAME_CASE(VPCOMU) | |||
| 35479 | NODE_NAME_CASE(VPERMIL2) | |||
| 35480 | NODE_NAME_CASE(FMSUB) | |||
| 35481 | NODE_NAME_CASE(STRICT_FMSUB) | |||
| 35482 | NODE_NAME_CASE(FNMADD) | |||
| 35483 | NODE_NAME_CASE(STRICT_FNMADD) | |||
| 35484 | NODE_NAME_CASE(FNMSUB) | |||
| 35485 | NODE_NAME_CASE(STRICT_FNMSUB) | |||
| 35486 | NODE_NAME_CASE(FMADDSUB) | |||
| 35487 | NODE_NAME_CASE(FMSUBADD) | |||
| 35488 | NODE_NAME_CASE(FMADD_RND) | |||
| 35489 | NODE_NAME_CASE(FNMADD_RND) | |||
| 35490 | NODE_NAME_CASE(FMSUB_RND) | |||
| 35491 | NODE_NAME_CASE(FNMSUB_RND) | |||
| 35492 | NODE_NAME_CASE(FMADDSUB_RND) | |||
| 35493 | NODE_NAME_CASE(FMSUBADD_RND) | |||
| 35494 | NODE_NAME_CASE(VFMADDC) | |||
| 35495 | NODE_NAME_CASE(VFMADDC_RND) | |||
| 35496 | NODE_NAME_CASE(VFCMADDC) | |||
| 35497 | NODE_NAME_CASE(VFCMADDC_RND) | |||
| 35498 | NODE_NAME_CASE(VFMULC) | |||
| 35499 | NODE_NAME_CASE(VFMULC_RND) | |||
| 35500 | NODE_NAME_CASE(VFCMULC) | |||
| 35501 | NODE_NAME_CASE(VFCMULC_RND) | |||
| 35502 | NODE_NAME_CASE(VFMULCSH) | |||
| 35503 | NODE_NAME_CASE(VFMULCSH_RND) | |||
| 35504 | NODE_NAME_CASE(VFCMULCSH) | |||
| 35505 | NODE_NAME_CASE(VFCMULCSH_RND) | |||
| 35506 | NODE_NAME_CASE(VFMADDCSH) | |||
| 35507 | NODE_NAME_CASE(VFMADDCSH_RND) | |||
| 35508 | NODE_NAME_CASE(VFCMADDCSH) | |||
| 35509 | NODE_NAME_CASE(VFCMADDCSH_RND) | |||
| 35510 | NODE_NAME_CASE(VPMADD52H) | |||
| 35511 | NODE_NAME_CASE(VPMADD52L) | |||
| 35512 | NODE_NAME_CASE(VRNDSCALE) | |||
| 35513 | NODE_NAME_CASE(STRICT_VRNDSCALE) | |||
| 35514 | NODE_NAME_CASE(VRNDSCALE_SAE) | |||
| 35515 | NODE_NAME_CASE(VRNDSCALES) | |||
| 35516 | NODE_NAME_CASE(VRNDSCALES_SAE) | |||
| 35517 | NODE_NAME_CASE(VREDUCE) | |||
| 35518 | NODE_NAME_CASE(VREDUCE_SAE) | |||
| 35519 | NODE_NAME_CASE(VREDUCES) | |||
| 35520 | NODE_NAME_CASE(VREDUCES_SAE) | |||
| 35521 | NODE_NAME_CASE(VGETMANT) | |||
| 35522 | NODE_NAME_CASE(VGETMANT_SAE) | |||
| 35523 | NODE_NAME_CASE(VGETMANTS) | |||
| 35524 | NODE_NAME_CASE(VGETMANTS_SAE) | |||
| 35525 | NODE_NAME_CASE(PCMPESTR) | |||
| 35526 | NODE_NAME_CASE(PCMPISTR) | |||
| 35527 | NODE_NAME_CASE(XTEST) | |||
| 35528 | NODE_NAME_CASE(COMPRESS) | |||
| 35529 | NODE_NAME_CASE(EXPAND) | |||
| 35530 | NODE_NAME_CASE(SELECTS) | |||
| 35531 | NODE_NAME_CASE(ADDSUB) | |||
| 35532 | NODE_NAME_CASE(RCP14) | |||
| 35533 | NODE_NAME_CASE(RCP14S) | |||
| 35534 | NODE_NAME_CASE(RCP28) | |||
| 35535 | NODE_NAME_CASE(RCP28_SAE) | |||
| 35536 | NODE_NAME_CASE(RCP28S) | |||
| 35537 | NODE_NAME_CASE(RCP28S_SAE) | |||
| 35538 | NODE_NAME_CASE(EXP2) | |||
| 35539 | NODE_NAME_CASE(EXP2_SAE) | |||
| 35540 | NODE_NAME_CASE(RSQRT14) | |||
| 35541 | NODE_NAME_CASE(RSQRT14S) | |||
| 35542 | NODE_NAME_CASE(RSQRT28) | |||
| 35543 | NODE_NAME_CASE(RSQRT28_SAE) | |||
| 35544 | NODE_NAME_CASE(RSQRT28S) | |||
| 35545 | NODE_NAME_CASE(RSQRT28S_SAE) | |||
| 35546 | NODE_NAME_CASE(FADD_RND) | |||
| 35547 | NODE_NAME_CASE(FADDS) | |||
| 35548 | NODE_NAME_CASE(FADDS_RND) | |||
| 35549 | NODE_NAME_CASE(FSUB_RND) | |||
| 35550 | NODE_NAME_CASE(FSUBS) | |||
| 35551 | NODE_NAME_CASE(FSUBS_RND) | |||
| 35552 | NODE_NAME_CASE(FMUL_RND) | |||
| 35553 | NODE_NAME_CASE(FMULS) | |||
| 35554 | NODE_NAME_CASE(FMULS_RND) | |||
| 35555 | NODE_NAME_CASE(FDIV_RND) | |||
| 35556 | NODE_NAME_CASE(FDIVS) | |||
| 35557 | NODE_NAME_CASE(FDIVS_RND) | |||
| 35558 | NODE_NAME_CASE(FSQRT_RND) | |||
| 35559 | NODE_NAME_CASE(FSQRTS) | |||
| 35560 | NODE_NAME_CASE(FSQRTS_RND) | |||
| 35561 | NODE_NAME_CASE(FGETEXP) | |||
| 35562 | NODE_NAME_CASE(FGETEXP_SAE) | |||
| 35563 | NODE_NAME_CASE(FGETEXPS) | |||
| 35564 | NODE_NAME_CASE(FGETEXPS_SAE) | |||
| 35565 | NODE_NAME_CASE(SCALEF) | |||
| 35566 | NODE_NAME_CASE(SCALEF_RND) | |||
| 35567 | NODE_NAME_CASE(SCALEFS) | |||
| 35568 | NODE_NAME_CASE(SCALEFS_RND) | |||
| 35569 | NODE_NAME_CASE(MULHRS) | |||
| 35570 | NODE_NAME_CASE(SINT_TO_FP_RND) | |||
| 35571 | NODE_NAME_CASE(UINT_TO_FP_RND) | |||
| 35572 | NODE_NAME_CASE(CVTTP2SI) | |||
| 35573 | NODE_NAME_CASE(CVTTP2UI) | |||
| 35574 | NODE_NAME_CASE(STRICT_CVTTP2SI) | |||
| 35575 | NODE_NAME_CASE(STRICT_CVTTP2UI) | |||
| 35576 | NODE_NAME_CASE(MCVTTP2SI) | |||
| 35577 | NODE_NAME_CASE(MCVTTP2UI) | |||
| 35578 | NODE_NAME_CASE(CVTTP2SI_SAE) | |||
| 35579 | NODE_NAME_CASE(CVTTP2UI_SAE) | |||
| 35580 | NODE_NAME_CASE(CVTTS2SI) | |||
| 35581 | NODE_NAME_CASE(CVTTS2UI) | |||
| 35582 | NODE_NAME_CASE(CVTTS2SI_SAE) | |||
| 35583 | NODE_NAME_CASE(CVTTS2UI_SAE) | |||
| 35584 | NODE_NAME_CASE(CVTSI2P) | |||
| 35585 | NODE_NAME_CASE(CVTUI2P) | |||
| 35586 | NODE_NAME_CASE(STRICT_CVTSI2P) | |||
| 35587 | NODE_NAME_CASE(STRICT_CVTUI2P) | |||
| 35588 | NODE_NAME_CASE(MCVTSI2P) | |||
| 35589 | NODE_NAME_CASE(MCVTUI2P) | |||
| 35590 | NODE_NAME_CASE(VFPCLASS) | |||
| 35591 | NODE_NAME_CASE(VFPCLASSS) | |||
| 35592 | NODE_NAME_CASE(MULTISHIFT) | |||
| 35593 | NODE_NAME_CASE(SCALAR_SINT_TO_FP) | |||
| 35594 | NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND) | |||
| 35595 | NODE_NAME_CASE(SCALAR_UINT_TO_FP) | |||
| 35596 | NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND) | |||
| 35597 | NODE_NAME_CASE(CVTPS2PH) | |||
| 35598 | NODE_NAME_CASE(STRICT_CVTPS2PH) | |||
| 35599 | NODE_NAME_CASE(CVTPS2PH_SAE) | |||
| 35600 | NODE_NAME_CASE(MCVTPS2PH) | |||
| 35601 | NODE_NAME_CASE(MCVTPS2PH_SAE) | |||
| 35602 | NODE_NAME_CASE(CVTPH2PS) | |||
| 35603 | NODE_NAME_CASE(STRICT_CVTPH2PS) | |||
| 35604 | NODE_NAME_CASE(CVTPH2PS_SAE) | |||
| 35605 | NODE_NAME_CASE(CVTP2SI) | |||
| 35606 | NODE_NAME_CASE(CVTP2UI) | |||
| 35607 | NODE_NAME_CASE(MCVTP2SI) | |||
| 35608 | NODE_NAME_CASE(MCVTP2UI) | |||
| 35609 | NODE_NAME_CASE(CVTP2SI_RND) | |||
| 35610 | NODE_NAME_CASE(CVTP2UI_RND) | |||
| 35611 | NODE_NAME_CASE(CVTS2SI) | |||
| 35612 | NODE_NAME_CASE(CVTS2UI) | |||
| 35613 | NODE_NAME_CASE(CVTS2SI_RND) | |||
| 35614 | NODE_NAME_CASE(CVTS2UI_RND) | |||
| 35615 | NODE_NAME_CASE(CVTNE2PS2BF16) | |||
| 35616 | NODE_NAME_CASE(CVTNEPS2BF16) | |||
| 35617 | NODE_NAME_CASE(MCVTNEPS2BF16) | |||
| 35618 | NODE_NAME_CASE(DPBF16PS) | |||
| 35619 | NODE_NAME_CASE(LWPINS) | |||
| 35620 | NODE_NAME_CASE(MGATHER) | |||
| 35621 | NODE_NAME_CASE(MSCATTER) | |||
| 35622 | NODE_NAME_CASE(VPDPBUSD) | |||
| 35623 | NODE_NAME_CASE(VPDPBUSDS) | |||
| 35624 | NODE_NAME_CASE(VPDPWSSD) | |||
| 35625 | NODE_NAME_CASE(VPDPWSSDS) | |||
| 35626 | NODE_NAME_CASE(VPSHUFBITQMB) | |||
| 35627 | NODE_NAME_CASE(GF2P8MULB) | |||
| 35628 | NODE_NAME_CASE(GF2P8AFFINEQB) | |||
| 35629 | NODE_NAME_CASE(GF2P8AFFINEINVQB) | |||
| 35630 | NODE_NAME_CASE(NT_CALL) | |||
| 35631 | NODE_NAME_CASE(NT_BRIND) | |||
| 35632 | NODE_NAME_CASE(UMWAIT) | |||
| 35633 | NODE_NAME_CASE(TPAUSE) | |||
| 35634 | NODE_NAME_CASE(ENQCMD) | |||
| 35635 | NODE_NAME_CASE(ENQCMDS) | |||
| 35636 | NODE_NAME_CASE(VP2INTERSECT) | |||
| 35637 | NODE_NAME_CASE(VPDPBSUD) | |||
| 35638 | NODE_NAME_CASE(VPDPBSUDS) | |||
| 35639 | NODE_NAME_CASE(VPDPBUUD) | |||
| 35640 | NODE_NAME_CASE(VPDPBUUDS) | |||
| 35641 | NODE_NAME_CASE(VPDPBSSD) | |||
| 35642 | NODE_NAME_CASE(VPDPBSSDS) | |||
| 35643 | NODE_NAME_CASE(AESENC128KL) | |||
| 35644 | NODE_NAME_CASE(AESDEC128KL) | |||
| 35645 | NODE_NAME_CASE(AESENC256KL) | |||
| 35646 | NODE_NAME_CASE(AESDEC256KL) | |||
| 35647 | NODE_NAME_CASE(AESENCWIDE128KL) | |||
| 35648 | NODE_NAME_CASE(AESDECWIDE128KL) | |||
| 35649 | NODE_NAME_CASE(AESENCWIDE256KL) | |||
| 35650 | NODE_NAME_CASE(AESDECWIDE256KL) | |||
| 35651 | NODE_NAME_CASE(CMPCCXADD) | |||
| 35652 | NODE_NAME_CASE(TESTUI) | |||
| 35653 | NODE_NAME_CASE(FP80_ADD) | |||
| 35654 | NODE_NAME_CASE(STRICT_FP80_ADD) | |||
| 35655 | } | |||
| 35656 | return nullptr; | |||
| 35657 | #undef NODE_NAME_CASE | |||
| 35658 | } | |||
| 35659 | ||||
| 35660 | /// Return true if the addressing mode represented by AM is legal for this | |||
| 35661 | /// target, for a load/store of the specified type. | |||
| 35662 | bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, | |||
| 35663 | const AddrMode &AM, Type *Ty, | |||
| 35664 | unsigned AS, | |||
| 35665 | Instruction *I) const { | |||
| 35666 | // X86 supports extremely general addressing modes. | |||
| 35667 | CodeModel::Model M = getTargetMachine().getCodeModel(); | |||
| 35668 | ||||
| 35669 | // X86 allows a sign-extended 32-bit immediate field as a displacement. | |||
| 35670 | if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr)) | |||
| 35671 | return false; | |||
| 35672 | ||||
| 35673 | if (AM.BaseGV) { | |||
| 35674 | unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV); | |||
| 35675 | ||||
| 35676 | // If a reference to this global requires an extra load, we can't fold it. | |||
| 35677 | if (isGlobalStubReference(GVFlags)) | |||
| 35678 | return false; | |||
| 35679 | ||||
| 35680 | // If BaseGV requires a register for the PIC base, we cannot also have a | |||
| 35681 | // BaseReg specified. | |||
| 35682 | if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags)) | |||
| 35683 | return false; | |||
| 35684 | ||||
| 35685 | // If lower 4G is not available, then we must use rip-relative addressing. | |||
| 35686 | if ((M != CodeModel::Small || isPositionIndependent()) && | |||
| 35687 | Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1)) | |||
| 35688 | return false; | |||
| 35689 | } | |||
| 35690 | ||||
| 35691 | switch (AM.Scale) { | |||
| 35692 | case 0: | |||
| 35693 | case 1: | |||
| 35694 | case 2: | |||
| 35695 | case 4: | |||
| 35696 | case 8: | |||
| 35697 | // These scales always work. | |||
| 35698 | break; | |||
| 35699 | case 3: | |||
| 35700 | case 5: | |||
| 35701 | case 9: | |||
| 35702 | // These scales are formed with basereg+scalereg. Only accept if there is | |||
| 35703 | // no basereg yet. | |||
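| | // e.g. scale 3 is encoded as base + 2*index with both operands being the | |||
| | // same register: lea (%rax,%rax,2), %rcx computes 3*%rax. The trick | |||
| | // consumes the base register slot, hence the HasBaseReg check. | |||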
| 35704 | if (AM.HasBaseReg) | |||
| 35705 | return false; | |||
| 35706 | break; | |||
| 35707 | default: // Other stuff never works. | |||
| 35708 | return false; | |||
| 35709 | } | |||
| 35710 | ||||
| 35711 | return true; | |||
| 35712 | } | |||
| 35713 | ||||
| 35714 | bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { | |||
| 35715 | unsigned Bits = Ty->getScalarSizeInBits(); | |||
| 35716 | ||||
| 35717 | // XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts. | |||
| 35718 | // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred. | |||
| 35719 | if (Subtarget.hasXOP() && | |||
| 35720 | (Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64)) | |||
| 35721 | return false; | |||
| 35722 | ||||
| 35723 | // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable | |||
| 35724 | // shifts just as cheap as scalar ones. | |||
| 35725 | if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64)) | |||
| 35726 | return false; | |||
| 35727 | ||||
| 35728 | // AVX512BW has shifts such as vpsllvw. | |||
| 35729 | if (Subtarget.hasBWI() && Bits == 16) | |||
| 35730 | return false; | |||
| 35731 | ||||
| 35732 | // Otherwise, it's significantly cheaper to shift by a scalar amount than by a | |||
| 35733 | // fully general vector. | |||
| 35734 | return true; | |||
| 35735 | } | |||
| 35736 | ||||
| 35737 | bool X86TargetLowering::isBinOp(unsigned Opcode) const { | |||
| 35738 | switch (Opcode) { | |||
| 35739 | // These are non-commutative binops. | |||
| 35740 | // TODO: Add more X86ISD opcodes once we have test coverage. | |||
| 35741 | case X86ISD::ANDNP: | |||
| 35742 | case X86ISD::PCMPGT: | |||
| 35743 | case X86ISD::FMAX: | |||
| 35744 | case X86ISD::FMIN: | |||
| 35745 | case X86ISD::FANDN: | |||
| 35746 | case X86ISD::VPSHA: | |||
| 35747 | case X86ISD::VPSHL: | |||
| 35748 | case X86ISD::VSHLV: | |||
| 35749 | case X86ISD::VSRLV: | |||
| 35750 | case X86ISD::VSRAV: | |||
| 35751 | return true; | |||
| 35752 | } | |||
| 35753 | ||||
| 35754 | return TargetLoweringBase::isBinOp(Opcode); | |||
| 35755 | } | |||
| 35756 | ||||
| 35757 | bool X86TargetLowering::isCommutativeBinOp(unsigned Opcode) const { | |||
| 35758 | switch (Opcode) { | |||
| 35759 | // TODO: Add more X86ISD opcodes once we have test coverage. | |||
| 35760 | case X86ISD::PCMPEQ: | |||
| 35761 | case X86ISD::PMULDQ: | |||
| 35762 | case X86ISD::PMULUDQ: | |||
| 35763 | case X86ISD::FMAXC: | |||
| 35764 | case X86ISD::FMINC: | |||
| 35765 | case X86ISD::FAND: | |||
| 35766 | case X86ISD::FOR: | |||
| 35767 | case X86ISD::FXOR: | |||
| 35768 | return true; | |||
| 35769 | } | |||
| 35770 | ||||
| 35771 | return TargetLoweringBase::isCommutativeBinOp(Opcode); | |||
| 35772 | } | |||
| 35773 | ||||
| 35774 | bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const { | |||
| 35775 | if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) | |||
| 35776 | return false; | |||
| 35777 | unsigned NumBits1 = Ty1->getPrimitiveSizeInBits(); | |||
| 35778 | unsigned NumBits2 = Ty2->getPrimitiveSizeInBits(); | |||
| 35779 | return NumBits1 > NumBits2; | |||
| 35780 | } | |||
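| | // E.g. truncating i64 to i32 is free on x86-64: the result is just a read of | |||
| | // the 32-bit subregister (%eax of %rax), so no instruction is needed. | |||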
| 35781 | ||||
| 35782 | bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const { | |||
| 35783 | if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy()) | |||
| 35784 | return false; | |||
| 35785 | ||||
| 35786 | if (!isTypeLegal(EVT::getEVT(Ty1))) | |||
| 35787 | return false; | |||
| 35788 | ||||
| 35789 | assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop"); | |||
| 35790 | ||||
| 35791 | // Assuming the caller doesn't have a zeroext or signext return parameter, | |||
| 35792 | // truncation all the way down to i1 is valid. | |||
| 35793 | return true; | |||
| 35794 | } | |||
| 35795 | ||||
| 35796 | bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const { | |||
| 35797 | return isInt<32>(Imm); | |||
| 35798 | } | |||
| 35799 | ||||
| 35800 | bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const { | |||
| 35801 | // Can also use sub to handle negated immediates. | |||
| 35802 | return isInt<32>(Imm); | |||
| 35803 | } | |||
| 35804 | ||||
| 35805 | bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const { | |||
| 35806 | return isInt<32>(Imm); | |||
| 35807 | } | |||
| 35808 | ||||
| 35809 | bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const { | |||
| 35810 | if (!VT1.isScalarInteger() || !VT2.isScalarInteger()) | |||
| 35811 | return false; | |||
| 35812 | unsigned NumBits1 = VT1.getSizeInBits(); | |||
| 35813 | unsigned NumBits2 = VT2.getSizeInBits(); | |||
| 35814 | return NumBits1 > NumBits2; | |||
| 35815 | } | |||
| 35816 | ||||
| 35817 | bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const { | |||
| 35818 | // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. | |||
| 35819 | return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit(); | |||
| 35820 | } | |||
| 35821 | ||||
| 35822 | bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const { | |||
| 35823 | // x86-64 implicitly zero-extends 32-bit results in 64-bit registers. | |||
| 35824 | return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit(); | |||
| 35825 | } | |||
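| | // E.g. 'movl %edi, %eax' already clears bits 63:32 of %rax, so a subsequent | |||
| | // zext from i32 to i64 costs nothing. | |||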
| 35826 | ||||
| 35827 | bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const { | |||
| 35828 | EVT VT1 = Val.getValueType(); | |||
| 35829 | if (isZExtFree(VT1, VT2)) | |||
| 35830 | return true; | |||
| 35831 | ||||
| 35832 | if (Val.getOpcode() != ISD::LOAD) | |||
| 35833 | return false; | |||
| 35834 | ||||
| 35835 | if (!VT1.isSimple() || !VT1.isInteger() || | |||
| 35836 | !VT2.isSimple() || !VT2.isInteger()) | |||
| 35837 | return false; | |||
| 35838 | ||||
| 35839 | switch (VT1.getSimpleVT().SimpleTy) { | |||
| 35840 | default: break; | |||
| 35841 | case MVT::i8: | |||
| 35842 | case MVT::i16: | |||
| 35843 | case MVT::i32: | |||
| 35844 | // X86 has 8, 16, and 32-bit zero-extending loads. | |||
| 35845 | return true; | |||
| 35846 | } | |||
| 35847 | ||||
| 35848 | return false; | |||
| 35849 | } | |||
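| | // E.g. a zext of an i8 load becomes a single 'movzbl (%rdi), %eax'; the load | |||
| | // itself performs the zero extension, so the extend is free. | |||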
| 35850 | ||||
| 35851 | bool X86TargetLowering::shouldSinkOperands(Instruction *I, | |||
| 35852 | SmallVectorImpl<Use *> &Ops) const { | |||
| 35853 | using namespace llvm::PatternMatch; | |||
| 35854 | ||||
| 35855 | FixedVectorType *VTy = dyn_cast<FixedVectorType>(I->getType()); | |||
| 35856 | if (!VTy) | |||
| 35857 | return false; | |||
| 35858 | ||||
| 35859 | if (I->getOpcode() == Instruction::Mul && | |||
| 35860 | VTy->getElementType()->isIntegerTy(64)) { | |||
| 35861 | for (auto &Op : I->operands()) { | |||
| 35862 | // Make sure we are not already sinking this operand | |||
| 35863 | if (any_of(Ops, [&](Use *U) { return U->get() == Op; })) | |||
| 35864 | continue; | |||
| 35865 | ||||
| 35866 | // Look for PMULDQ pattern where the input is a sext_inreg from vXi32 or | |||
| 35867 | // the PMULUDQ pattern where the input is a zext_inreg from vXi32. | |||
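| | // For example (illustrative pseudo-IR, per 64-bit element): | |||
| | //   %s = ashr (shl %x, 32), 32    ; sext_inreg from i32 -> PMULDQ | |||
| | //   %z = and %x, 0xffffffff       ; zext_inreg from i32 -> PMULUDQ | |||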
| 35868 | if (Subtarget.hasSSE41() && | |||
| 35869 | match(Op.get(), m_AShr(m_Shl(m_Value(), m_SpecificInt(32)), | |||
| 35870 | m_SpecificInt(32)))) { | |||
| 35871 | Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0)); | |||
| 35872 | Ops.push_back(&Op); | |||
| 35873 | } else if (Subtarget.hasSSE2() && | |||
| 35874 | match(Op.get(), | |||
| 35875 | m_And(m_Value(), m_SpecificInt(UINT64_C(0xffffffff))))) { | |||
| 35876 | Ops.push_back(&Op); | |||
| 35877 | } | |||
| 35878 | } | |||
| 35879 | ||||
| 35880 | return !Ops.empty(); | |||
| 35881 | } | |||
| 35882 | ||||
| 35883 | // A uniform shift amount in a vector shift or funnel shift may be much | |||
| 35884 | // cheaper than a generic variable vector shift, so make that pattern visible | |||
| 35885 | // to SDAG by sinking the shuffle instruction next to the shift. | |||
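| | // For example (illustrative pseudo-IR), given | |||
| | //   %amt = shufflevector <4 x i32> %v, poison, zeroinitializer | |||
| | //   %r   = shl <4 x i32> %x, %amt | |||
| | // sinking the shuffle next to the shl lets ISel see the splat and select | |||
| | // PSLLD with a scalar (xmm) amount instead of a variable vector shift. | |||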
| 35886 | int ShiftAmountOpNum = -1; | |||
| 35887 | if (I->isShift()) | |||
| 35888 | ShiftAmountOpNum = 1; | |||
| 35889 | else if (auto *II = dyn_cast<IntrinsicInst>(I)) { | |||
| 35890 | if (II->getIntrinsicID() == Intrinsic::fshl || | |||
| 35891 | II->getIntrinsicID() == Intrinsic::fshr) | |||
| 35892 | ShiftAmountOpNum = 2; | |||
| 35893 | } | |||
| 35894 | ||||
| 35895 | if (ShiftAmountOpNum == -1) | |||
| 35896 | return false; | |||
| 35897 | ||||
| 35898 | auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum)); | |||
| 35899 | if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 && | |||
| 35900 | isVectorShiftByScalarCheap(I->getType())) { | |||
| 35901 | Ops.push_back(&I->getOperandUse(ShiftAmountOpNum)); | |||
| 35902 | return true; | |||
| 35903 | } | |||
| 35904 | ||||
| 35905 | return false; | |||
| 35906 | } | |||
| 35907 | ||||
| 35908 | bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const { | |||
| 35909 | if (!Subtarget.is64Bit()) | |||
| 35910 | return false; | |||
| 35911 | return TargetLowering::shouldConvertPhiType(From, To); | |||
| 35912 | } | |||
| 35913 | ||||
| 35914 | bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { | |||
| 35915 | if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0))) | |||
| 35916 | return false; | |||
| 35917 | ||||
| 35918 | EVT SrcVT = ExtVal.getOperand(0).getValueType(); | |||
| 35919 | ||||
| 35920 | // There is no extending load for vXi1. | |||
| 35921 | if (SrcVT.getScalarType() == MVT::i1) | |||
| 35922 | return false; | |||
| 35923 | ||||
| 35924 | return true; | |||
| 35925 | } | |||
| 35926 | ||||
| 35927 | bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, | |||
| 35928 | EVT VT) const { | |||
| 35929 | if (!Subtarget.hasAnyFMA()) | |||
| 35930 | return false; | |||
| 35931 | ||||
| 35932 | VT = VT.getScalarType(); | |||
| 35933 | ||||
| 35934 | if (!VT.isSimple()) | |||
| 35935 | return false; | |||
| 35936 | ||||
| 35937 | switch (VT.getSimpleVT().SimpleTy) { | |||
| 35938 | case MVT::f16: | |||
| 35939 | return Subtarget.hasFP16(); | |||
| 35940 | case MVT::f32: | |||
| 35941 | case MVT::f64: | |||
| 35942 | return true; | |||
| 35943 | default: | |||
| 35944 | break; | |||
| 35945 | } | |||
| 35946 | ||||
| 35947 | return false; | |||
| 35948 | } | |||
| 35949 | ||||
| 35950 | bool X86TargetLowering::isNarrowingProfitable(EVT SrcVT, EVT DestVT) const { | |||
| 35951 | // i16 instructions are longer (0x66 prefix) and potentially slower. | |||
| 35952 | return !(SrcVT == MVT::i32 && DestVT == MVT::i16); | |||
| 35953 | } | |||
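| | // E.g. a 16-bit 'add' carries the 0x66 operand-size prefix, and with a 16-bit | |||
| | // immediate it can incur length-changing-prefix decode stalls. | |||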
| 35954 | ||||
| 35955 | bool X86TargetLowering::shouldFoldSelectWithIdentityConstant(unsigned Opcode, | |||
| 35956 | EVT VT) const { | |||
| 35957 | // TODO: This is too general. There are cases where pre-AVX512 codegen would | |||
| 35958 | // benefit. The transform may also be profitable for scalar code. | |||
| 35959 | if (!Subtarget.hasAVX512()) | |||
| 35960 | return false; | |||
| 35961 | if (!Subtarget.hasVLX() && !VT.is512BitVector()) | |||
| 35962 | return false; | |||
| 35963 | if (!VT.isVector() || VT.getScalarType() == MVT::i1) | |||
| 35964 | return false; | |||
| 35965 | ||||
| 35966 | return true; | |||
| 35967 | } | |||
| 35968 | ||||
| 35969 | /// Targets can use this to indicate that they only support *some* | |||
| 35970 | /// VECTOR_SHUFFLE operations, those with specific masks. | |||
| 35971 | /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values | |||
| 35972 | /// are assumed to be legal. | |||
| 35973 | bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const { | |||
| 35974 | if (!VT.isSimple()) | |||
| 35975 | return false; | |||
| 35976 | ||||
| 35977 | // Not for i1 vectors | |||
| 35978 | if (VT.getSimpleVT().getScalarType() == MVT::i1) | |||
| 35979 | return false; | |||
| 35980 | ||||
| 35981 | // Very little shuffling can be done for 64-bit vectors right now. | |||
| 35982 | if (VT.getSimpleVT().getSizeInBits() == 64) | |||
| 35983 | return false; | |||
| 35984 | ||||
| 35985 | // We only care that the types being shuffled are legal. The lowering can | |||
| 35986 | // handle any possible shuffle mask that results. | |||
| 35987 | return isTypeLegal(VT.getSimpleVT()); | |||
| 35988 | } | |||
| 35989 | ||||
| 35990 | bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask, | |||
| 35991 | EVT VT) const { | |||
| 35992 | // Don't convert an 'and' into a shuffle that we don't directly support. | |||
| 35993 | // vpblendw and vpshufb for 256-bit vectors are not available on AVX1. | |||
| 35994 | if (!Subtarget.hasAVX2()) | |||
| 35995 | if (VT == MVT::v32i8 || VT == MVT::v16i16) | |||
| 35996 | return false; | |||
| 35997 | ||||
| 35998 | // Just delegate to the generic legality, clear masks aren't special. | |||
| 35999 | return isShuffleMaskLegal(Mask, VT); | |||
| 36000 | } | |||
| 36001 | ||||
| 36002 | bool X86TargetLowering::areJTsAllowed(const Function *Fn) const { | |||
| 36003 | // If the subtarget is using indirect thunk branches, we must not generate jump tables. | |||
| 36004 | if (Subtarget.useIndirectThunkBranches()) | |||
| 36005 | return false; | |||
| 36006 | ||||
| 36007 | // Otherwise, fallback on the generic logic. | |||
| 36008 | return TargetLowering::areJTsAllowed(Fn); | |||
| 36009 | } | |||
| 36010 | ||||
| 36011 | MVT X86TargetLowering::getPreferredSwitchConditionType(LLVMContext &Context, | |||
| 36012 | EVT ConditionVT) const { | |||
| 36013 | // Avoid 8 and 16 bit types because they increase the chance for unnecessary | |||
| 36014 | // zero-extensions. | |||
| 36015 | if (ConditionVT.getSizeInBits() < 32) | |||
| 36016 | return MVT::i32; | |||
| 36017 | return TargetLoweringBase::getPreferredSwitchConditionType(Context, | |||
| 36018 | ConditionVT); | |||
| 36019 | } | |||
| 36020 | ||||
| 36021 | //===----------------------------------------------------------------------===// | |||
| 36022 | // X86 Scheduler Hooks | |||
| 36023 | //===----------------------------------------------------------------------===// | |||
| 36024 | ||||
| 36025 | // Returns true if EFLAGS is consumed after this iterator in the rest of the | |||
| 36026 | // basic block or any successors of the basic block. | |||
| 36027 | static bool isEFLAGSLiveAfter(MachineBasicBlock::iterator Itr, | |||
| 36028 | MachineBasicBlock *BB) { | |||
| 36029 | // Scan forward through BB for a use/def of EFLAGS. | |||
| 36030 | for (const MachineInstr &mi : llvm::make_range(std::next(Itr), BB->end())) { | |||
| 36031 | if (mi.readsRegister(X86::EFLAGS)) | |||
| 36032 | return true; | |||
| 36033 | // If we found a def, we can stop searching. | |||
| 36034 | if (mi.definesRegister(X86::EFLAGS)) | |||
| 36035 | return false; | |||
| 36036 | } | |||
| 36037 | ||||
| 36038 | // If we hit the end of the block, check whether EFLAGS is live into a | |||
| 36039 | // successor. | |||
| 36040 | for (MachineBasicBlock *Succ : BB->successors()) | |||
| 36041 | if (Succ->isLiveIn(X86::EFLAGS)) | |||
| 36042 | return true; | |||
| 36043 | ||||
| 36044 | return false; | |||
| 36045 | } | |||
| 36046 | ||||
| 36047 | /// Utility function to emit xbegin specifying the start of an RTM region. | |||
| 36048 | static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB, | |||
| 36049 | const TargetInstrInfo *TII) { | |||
| 36050 | const DebugLoc &DL = MI.getDebugLoc(); | |||
| 36051 | ||||
| 36052 | const BasicBlock *BB = MBB->getBasicBlock(); | |||
| 36053 | MachineFunction::iterator I = ++MBB->getIterator(); | |||
| 36054 | ||||
| 36055 | // For the v = xbegin(), we generate | |||
| 36056 | // | |||
| 36057 | // thisMBB: | |||
| 36058 | // xbegin fallMBB | |||
| 36059 | // | |||
| 36060 | // mainMBB: | |||
| 36061 | // s0 = -1 | |||
| 36062 | // | |||
| 36063 | // fallBB: | |||
| 36064 | // eax = # XABORT_DEF | |||
| 36065 | // s1 = eax | |||
| 36066 | // | |||
| 36067 | // sinkMBB: | |||
| 36068 | // v = phi(s0/mainBB, s1/fallBB) | |||
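| | // | |||
| | // Roughly, the emitted assembly is (illustrative): | |||
| | //   xbegin  .Lfall        # on abort, HW resumes at .Lfall, status in EAX | |||
| | //   movl    $-1, %eax     # mainMBB: success value | |||
| | //   jmp     .Lsink | |||
| | // .Lfall:                 # XABORT_DEF models the hardware def of EAX | |||
| | // .Lsink:                 # PHI merges the two values | |||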
| 36069 | ||||
| 36070 | MachineBasicBlock *thisMBB = MBB; | |||
| 36071 | MachineFunction *MF = MBB->getParent(); | |||
| 36072 | MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); | |||
| 36073 | MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB); | |||
| 36074 | MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); | |||
| 36075 | MF->insert(I, mainMBB); | |||
| 36076 | MF->insert(I, fallMBB); | |||
| 36077 | MF->insert(I, sinkMBB); | |||
| 36078 | ||||
| 36079 | if (isEFLAGSLiveAfter(MI, MBB)) { | |||
| 36080 | mainMBB->addLiveIn(X86::EFLAGS); | |||
| 36081 | fallMBB->addLiveIn(X86::EFLAGS); | |||
| 36082 | sinkMBB->addLiveIn(X86::EFLAGS); | |||
| 36083 | } | |||
| 36084 | ||||
| 36085 | // Transfer the remainder of BB and its successor edges to sinkMBB. | |||
| 36086 | sinkMBB->splice(sinkMBB->begin(), MBB, | |||
| 36087 | std::next(MachineBasicBlock::iterator(MI)), MBB->end()); | |||
| 36088 | sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); | |||
| 36089 | ||||
| 36090 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
| 36091 | Register DstReg = MI.getOperand(0).getReg(); | |||
| 36092 | const TargetRegisterClass *RC = MRI.getRegClass(DstReg); | |||
| 36093 | Register mainDstReg = MRI.createVirtualRegister(RC); | |||
| 36094 | Register fallDstReg = MRI.createVirtualRegister(RC); | |||
| 36095 | ||||
| 36096 | // thisMBB: | |||
| 36097 | // xbegin fallMBB | |||
| 36098 | // # fallthrough to mainMBB | |||
| 36099 | // # abort jumps to fallMBB | |||
| 36100 | BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB); | |||
| 36101 | thisMBB->addSuccessor(mainMBB); | |||
| 36102 | thisMBB->addSuccessor(fallMBB); | |||
| 36103 | ||||
| 36104 | // mainMBB: | |||
| 36105 | // mainDstReg := -1 | |||
| 36106 | BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1); | |||
| 36107 | BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); | |||
| 36108 | mainMBB->addSuccessor(sinkMBB); | |||
| 36109 | ||||
| 36110 | // fallMBB: | |||
| 36111 | // ; pseudo instruction to model hardware's definition from XABORT | |||
| 36112 | // EAX := XABORT_DEF | |||
| 36113 | // fallDstReg := EAX | |||
| 36114 | BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF)); | |||
| 36115 | BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg) | |||
| 36116 | .addReg(X86::EAX); | |||
| 36117 | fallMBB->addSuccessor(sinkMBB); | |||
| 36118 | ||||
| 36119 | // sinkMBB: | |||
| 36120 | // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB) | |||
| 36121 | BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg) | |||
| 36122 | .addReg(mainDstReg).addMBB(mainMBB) | |||
| 36123 | .addReg(fallDstReg).addMBB(fallMBB); | |||
| 36124 | ||||
| 36125 | MI.eraseFromParent(); | |||
| 36126 | return sinkMBB; | |||
| 36127 | } | |||
| 36128 | ||||
| 36129 | MachineBasicBlock * | |||
| 36130 | X86TargetLowering::EmitVAARGWithCustomInserter(MachineInstr &MI, | |||
| 36131 | MachineBasicBlock *MBB) const { | |||
| 36132 | // Emit va_arg instruction on X86-64. | |||
| 36133 | ||||
| 36134 | // Operands to this pseudo-instruction: | |||
| 36135 | // 0 ) Output : destination address (reg) | |||
| 36136 | // 1-5) Input : va_list address (addr, i64mem) | |||
| 36137 | // 6 ) ArgSize : Size (in bytes) of vararg type | |||
| 36138 | // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset | |||
| 36139 | // 8 ) Align : Alignment of type | |||
| 36140 | // 9 ) EFLAGS (implicit-def) | |||
| 36141 | ||||
| 36142 | assert(MI.getNumOperands() == 10 && "VAARG should have 10 operands!"); | |||
| 36143 | static_assert(X86::AddrNumOperands == 5, "VAARG assumes 5 address operands"); | |||
| 36144 | ||||
| 36145 | Register DestReg = MI.getOperand(0).getReg(); | |||
| 36146 | MachineOperand &Base = MI.getOperand(1); | |||
| 36147 | MachineOperand &Scale = MI.getOperand(2); | |||
| 36148 | MachineOperand &Index = MI.getOperand(3); | |||
| 36149 | MachineOperand &Disp = MI.getOperand(4); | |||
| 36150 | MachineOperand &Segment = MI.getOperand(5); | |||
| 36151 | unsigned ArgSize = MI.getOperand(6).getImm(); | |||
| 36152 | unsigned ArgMode = MI.getOperand(7).getImm(); | |||
| 36153 | Align Alignment = Align(MI.getOperand(8).getImm()); | |||
| 36154 | ||||
| 36155 | MachineFunction *MF = MBB->getParent(); | |||
| 36156 | ||||
| 36157 | // Memory Reference | |||
| 36158 | assert(MI.hasOneMemOperand() && "Expected VAARG to have one memoperand"); | |||
| 36159 | ||||
| 36160 | MachineMemOperand *OldMMO = MI.memoperands().front(); | |||
| 36161 | ||||
| 36162 | // Clone the MMO into two separate MMOs for loading and storing | |||
| 36163 | MachineMemOperand *LoadOnlyMMO = MF->getMachineMemOperand( | |||
| 36164 | OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOStore); | |||
| 36165 | MachineMemOperand *StoreOnlyMMO = MF->getMachineMemOperand( | |||
| 36166 | OldMMO, OldMMO->getFlags() & ~MachineMemOperand::MOLoad); | |||
| 36167 | ||||
| 36168 | // Machine Information | |||
| 36169 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); | |||
| 36170 | MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); | |||
| 36171 | const TargetRegisterClass *AddrRegClass = | |||
| 36172 | getRegClassFor(getPointerTy(MBB->getParent()->getDataLayout())); | |||
| 36173 | const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); | |||
| 36174 | const DebugLoc &DL = MI.getDebugLoc(); | |||
| 36175 | ||||
| 36176 | // struct va_list { | |||
| 36177 | // i32 gp_offset | |||
| 36178 | // i32 fp_offset | |||
| 36179 | // i64 overflow_area (address) | |||
| 36180 | // i64 reg_save_area (address) | |||
| 36181 | // } | |||
| 36182 | // sizeof(va_list) = 24 | |||
| 36183 | // alignment(va_list) = 8 | |||
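| | // | |||
| | // Conceptually, the code below is a sketch of the SysV va_arg algorithm | |||
| | // (gp bound 6*8 = 48, fp bound 48 + 8*16 = 176): | |||
| | //   if (offset < bound) { p = reg_save_area + offset; offset += 8 or 16; } | |||
| | //   else { p = align(overflow_area); overflow_area += roundup(size, 8); } | |||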
| 36184 | ||||
| 36185 | unsigned TotalNumIntRegs = 6; | |||
| 36186 | unsigned TotalNumXMMRegs = 8; | |||
| 36187 | bool UseGPOffset = (ArgMode == 1); | |||
| 36188 | bool UseFPOffset = (ArgMode == 2); | |||
| 36189 | unsigned MaxOffset = TotalNumIntRegs * 8 + | |||
| 36190 | (UseFPOffset ? TotalNumXMMRegs * 16 : 0); | |||
| 36191 | ||||
| 36192 | // Align ArgSize to a multiple of 8. | |||
| 36193 | unsigned ArgSizeA8 = (ArgSize + 7) & ~7; | |||
| 36194 | bool NeedsAlign = (Alignment > 8); | |||
| 36195 | ||||
| 36196 | MachineBasicBlock *thisMBB = MBB; | |||
| 36197 | MachineBasicBlock *overflowMBB; | |||
| 36198 | MachineBasicBlock *offsetMBB; | |||
| 36199 | MachineBasicBlock *endMBB; | |||
| 36200 | ||||
| 36201 | unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB | |||
| 36202 | unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB | |||
| 36203 | unsigned OffsetReg = 0; | |||
| 36204 | ||||
| 36205 | if (!UseGPOffset && !UseFPOffset) { | |||
| 36206 | // If we only pull from the overflow region, we don't create a branch. | |||
| 36207 | // We don't need to alter control flow. | |||
| 36208 | OffsetDestReg = 0; // unused | |||
| 36209 | OverflowDestReg = DestReg; | |||
| 36210 | ||||
| 36211 | offsetMBB = nullptr; | |||
| 36212 | overflowMBB = thisMBB; | |||
| 36213 | endMBB = thisMBB; | |||
| 36214 | } else { | |||
| 36215 | // First emit code to check if gp_offset (or fp_offset) is below the bound. | |||
| 36216 | // If so, pull the argument from reg_save_area. (branch to offsetMBB) | |||
| 36217 | // If not, pull from overflow_area. (branch to overflowMBB) | |||
| 36218 | // | |||
| 36219 | // thisMBB | |||
| 36220 | // | . | |||
| 36221 | // | . | |||
| 36222 | // offsetMBB overflowMBB | |||
| 36223 | // | . | |||
| 36224 | // | . | |||
| 36225 | // endMBB | |||
| 36226 | ||||
| 36227 | // Registers for the PHI in endMBB | |||
| 36228 | OffsetDestReg = MRI.createVirtualRegister(AddrRegClass); | |||
| 36229 | OverflowDestReg = MRI.createVirtualRegister(AddrRegClass); | |||
| 36230 | ||||
| 36231 | const BasicBlock *LLVM_BB = MBB->getBasicBlock(); | |||
| 36232 | overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB); | |||
| 36233 | offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB); | |||
| 36234 | endMBB = MF->CreateMachineBasicBlock(LLVM_BB); | |||
| 36235 | ||||
| 36236 | MachineFunction::iterator MBBIter = ++MBB->getIterator(); | |||
| 36237 | ||||
| 36238 | // Insert the new basic blocks | |||
| 36239 | MF->insert(MBBIter, offsetMBB); | |||
| 36240 | MF->insert(MBBIter, overflowMBB); | |||
| 36241 | MF->insert(MBBIter, endMBB); | |||
| 36242 | ||||
| 36243 | // Transfer the remainder of MBB and its successor edges to endMBB. | |||
| 36244 | endMBB->splice(endMBB->begin(), thisMBB, | |||
| 36245 | std::next(MachineBasicBlock::iterator(MI)), thisMBB->end()); | |||
| 36246 | endMBB->transferSuccessorsAndUpdatePHIs(thisMBB); | |||
| 36247 | ||||
| 36248 | // Make offsetMBB and overflowMBB successors of thisMBB | |||
| 36249 | thisMBB->addSuccessor(offsetMBB); | |||
| 36250 | thisMBB->addSuccessor(overflowMBB); | |||
| 36251 | ||||
| 36252 | // endMBB is a successor of both offsetMBB and overflowMBB | |||
| 36253 | offsetMBB->addSuccessor(endMBB); | |||
| 36254 | overflowMBB->addSuccessor(endMBB); | |||
| 36255 | ||||
| 36256 | // Load the offset value into a register | |||
| 36257 | OffsetReg = MRI.createVirtualRegister(OffsetRegClass); | |||
| 36258 | BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg) | |||
| 36259 | .add(Base) | |||
| 36260 | .add(Scale) | |||
| 36261 | .add(Index) | |||
| 36262 | .addDisp(Disp, UseFPOffset ? 4 : 0) | |||
| 36263 | .add(Segment) | |||
| 36264 | .setMemRefs(LoadOnlyMMO); | |||
| 36265 | ||||
| 36266 | // Check if there is enough room left to pull this argument. | |||
| 36267 | BuildMI(thisMBB, DL, TII->get(X86::CMP32ri)) | |||
| 36268 | .addReg(OffsetReg) | |||
| 36269 | .addImm(MaxOffset + 8 - ArgSizeA8); | |||
| 36270 | ||||
| 36271 | // Branch to "overflowMBB" if offset >= max | |||
| 36272 | // Fall through to "offsetMBB" otherwise | |||
| 36273 | BuildMI(thisMBB, DL, TII->get(X86::JCC_1)) | |||
| 36274 | .addMBB(overflowMBB).addImm(X86::COND_AE); | |||
| 36275 | } | |||
| 36276 | ||||
| 36277 | // In offsetMBB, emit code to use the reg_save_area. | |||
| 36278 | if (offsetMBB) { | |||
| 36279 | assert(OffsetReg != 0); | |||
| 36280 | ||||
| 36281 | // Read the reg_save_area address. | |||
| 36282 | Register RegSaveReg = MRI.createVirtualRegister(AddrRegClass); | |||
| 36283 | BuildMI( | |||
| 36284 | offsetMBB, DL, | |||
| 36285 | TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm), | |||
| 36286 | RegSaveReg) | |||
| 36287 | .add(Base) | |||
| 36288 | .add(Scale) | |||
| 36289 | .add(Index) | |||
| 36290 | .addDisp(Disp, Subtarget.isTarget64BitLP64() ? 16 : 12) | |||
| 36291 | .add(Segment) | |||
| 36292 | .setMemRefs(LoadOnlyMMO); | |||
| 36293 | ||||
| 36294 | if (Subtarget.isTarget64BitLP64()) { | |||
| 36295 | // Zero-extend the offset | |||
| 36296 | Register OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); | |||
| 36297 | BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) | |||
| 36298 | .addImm(0) | |||
| 36299 | .addReg(OffsetReg) | |||
| 36300 | .addImm(X86::sub_32bit); | |||
| 36301 | ||||
| 36302 | // Add the offset to the reg_save_area to get the final address. | |||
| 36303 | BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) | |||
| 36304 | .addReg(OffsetReg64) | |||
| 36305 | .addReg(RegSaveReg); | |||
| 36306 | } else { | |||
| 36307 | // Add the offset to the reg_save_area to get the final address. | |||
| 36308 | BuildMI(offsetMBB, DL, TII->get(X86::ADD32rr), OffsetDestReg) | |||
| 36309 | .addReg(OffsetReg) | |||
| 36310 | .addReg(RegSaveReg); | |||
| 36311 | } | |||
| 36312 | ||||
| 36313 | // Compute the offset for the next argument | |||
| 36314 | Register NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass); | |||
| 36315 | BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg) | |||
| 36316 | .addReg(OffsetReg) | |||
| 36317 | .addImm(UseFPOffset ? 16 : 8); | |||
| 36318 | ||||
| 36319 | // Store it back into the va_list. | |||
| 36320 | BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr)) | |||
| 36321 | .add(Base) | |||
| 36322 | .add(Scale) | |||
| 36323 | .add(Index) | |||
| 36324 | .addDisp(Disp, UseFPOffset ? 4 : 0) | |||
| 36325 | .add(Segment) | |||
| 36326 | .addReg(NextOffsetReg) | |||
| 36327 | .setMemRefs(StoreOnlyMMO); | |||
| 36328 | ||||
| 36329 | // Jump to endMBB | |||
| 36330 | BuildMI(offsetMBB, DL, TII->get(X86::JMP_1)) | |||
| 36331 | .addMBB(endMBB); | |||
| 36332 | } | |||
| 36333 | ||||
| 36334 | // | |||
| 36335 | // Emit code to use overflow area | |||
| 36336 | // | |||
| 36337 | ||||
| 36338 | // Load the overflow_area address into a register. | |||
| 36339 | Register OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); | |||
| 36340 | BuildMI(overflowMBB, DL, | |||
| 36341 | TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64rm : X86::MOV32rm), | |||
| 36342 | OverflowAddrReg) | |||
| 36343 | .add(Base) | |||
| 36344 | .add(Scale) | |||
| 36345 | .add(Index) | |||
| 36346 | .addDisp(Disp, 8) | |||
| 36347 | .add(Segment) | |||
| 36348 | .setMemRefs(LoadOnlyMMO); | |||
| 36349 | ||||
| 36350 | // If we need to align it, do so. Otherwise, just copy the address | |||
| 36351 | // to OverflowDestReg. | |||
| 36352 | if (NeedsAlign) { | |||
| 36353 | // Align the overflow address | |||
| 36354 | Register TmpReg = MRI.createVirtualRegister(AddrRegClass); | |||
| 36355 | ||||
| 36356 | // aligned_addr = (addr + (align-1)) & ~(align-1) | |||
| 36357 | BuildMI( | |||
| 36358 | overflowMBB, DL, | |||
| 36359 | TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri), | |||
| 36360 | TmpReg) | |||
| 36361 | .addReg(OverflowAddrReg) | |||
| 36362 | .addImm(Alignment.value() - 1); | |||
| 36363 | ||||
| 36364 | BuildMI( | |||
| 36365 | overflowMBB, DL, | |||
| 36366 | TII->get(Subtarget.isTarget64BitLP64() ? X86::AND64ri32 : X86::AND32ri), | |||
| 36367 | OverflowDestReg) | |||
| 36368 | .addReg(TmpReg) | |||
| 36369 | .addImm(~(uint64_t)(Alignment.value() - 1)); | |||
| 36370 | } else { | |||
| 36371 | BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg) | |||
| 36372 | .addReg(OverflowAddrReg); | |||
| 36373 | } | |||
| 36374 | ||||
| 36375 | // Compute the next overflow address after this argument. | |||
| 36376 | // (the overflow address should be kept 8-byte aligned) | |||
| 36377 | Register NextAddrReg = MRI.createVirtualRegister(AddrRegClass); | |||
| 36378 | BuildMI( | |||
| 36379 | overflowMBB, DL, | |||
| 36380 | TII->get(Subtarget.isTarget64BitLP64() ? X86::ADD64ri32 : X86::ADD32ri), | |||
| 36381 | NextAddrReg) | |||
| 36382 | .addReg(OverflowDestReg) | |||
| 36383 | .addImm(ArgSizeA8); | |||
| 36384 | ||||
| 36385 | // Store the new overflow address. | |||
| 36386 | BuildMI(overflowMBB, DL, | |||
| 36387 | TII->get(Subtarget.isTarget64BitLP64() ? X86::MOV64mr : X86::MOV32mr)) | |||
| 36388 | .add(Base) | |||
| 36389 | .add(Scale) | |||
| 36390 | .add(Index) | |||
| 36391 | .addDisp(Disp, 8) | |||
| 36392 | .add(Segment) | |||
| 36393 | .addReg(NextAddrReg) | |||
| 36394 | .setMemRefs(StoreOnlyMMO); | |||
| 36395 | ||||
| 36396 | // If we branched, emit the PHI to the front of endMBB. | |||
| 36397 | if (offsetMBB) { | |||
| 36398 | BuildMI(*endMBB, endMBB->begin(), DL, | |||
| 36399 | TII->get(X86::PHI), DestReg) | |||
| 36400 | .addReg(OffsetDestReg).addMBB(offsetMBB) | |||
| 36401 | .addReg(OverflowDestReg).addMBB(overflowMBB); | |||
| 36402 | } | |||
| 36403 | ||||
| 36404 | // Erase the pseudo instruction | |||
| 36405 | MI.eraseFromParent(); | |||
| 36406 | ||||
| 36407 | return endMBB; | |||
| 36408 | } | |||
| 36409 | ||||
| 36410 | // The EFLAGS operand of SelectItr might be missing a kill marker | |||
| 36411 | // because there were multiple uses of EFLAGS, and ISel didn't know | |||
| 36412 | // which to mark. Figure out whether SelectItr should have had a | |||
| 36413 | // kill marker, and set it if it should. Returns the correct kill | |||
| 36414 | // marker value. | |||
| 36415 | static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr, | |||
| 36416 | MachineBasicBlock* BB, | |||
| 36417 | const TargetRegisterInfo* TRI) { | |||
| 36418 | if (isEFLAGSLiveAfter(SelectItr, BB)) | |||
| 36419 | return false; | |||
| 36420 | ||||
| 36421 | // We found a def, or hit the end of the basic block and EFLAGS wasn't live | |||
| 36422 | // out. SelectMI should have a kill flag on EFLAGS. | |||
| 36423 | SelectItr->addRegisterKilled(X86::EFLAGS, TRI); | |||
| 36424 | return true; | |||
| 36425 | } | |||
| 36426 | ||||
| 36427 | // Return true if it is OK for this CMOV pseudo-opcode to be cascaded | |||
| 36428 | // together with other CMOV pseudo-opcodes into a single basic-block with | |||
| 36429 | // conditional jump around it. | |||
| 36430 | static bool isCMOVPseudo(MachineInstr &MI) { | |||
| 36431 | switch (MI.getOpcode()) { | |||
| 36432 | case X86::CMOV_FR16: | |||
| 36433 | case X86::CMOV_FR16X: | |||
| 36434 | case X86::CMOV_FR32: | |||
| 36435 | case X86::CMOV_FR32X: | |||
| 36436 | case X86::CMOV_FR64: | |||
| 36437 | case X86::CMOV_FR64X: | |||
| 36438 | case X86::CMOV_GR8: | |||
| 36439 | case X86::CMOV_GR16: | |||
| 36440 | case X86::CMOV_GR32: | |||
| 36441 | case X86::CMOV_RFP32: | |||
| 36442 | case X86::CMOV_RFP64: | |||
| 36443 | case X86::CMOV_RFP80: | |||
| 36444 | case X86::CMOV_VR64: | |||
| 36445 | case X86::CMOV_VR128: | |||
| 36446 | case X86::CMOV_VR128X: | |||
| 36447 | case X86::CMOV_VR256: | |||
| 36448 | case X86::CMOV_VR256X: | |||
| 36449 | case X86::CMOV_VR512: | |||
| 36450 | case X86::CMOV_VK1: | |||
| 36451 | case X86::CMOV_VK2: | |||
| 36452 | case X86::CMOV_VK4: | |||
| 36453 | case X86::CMOV_VK8: | |||
| 36454 | case X86::CMOV_VK16: | |||
| 36455 | case X86::CMOV_VK32: | |||
| 36456 | case X86::CMOV_VK64: | |||
| 36457 | return true; | |||
| 36458 | ||||
| 36459 | default: | |||
| 36460 | return false; | |||
| 36461 | } | |||
| 36462 | } | |||
| 36463 | ||||
| 36464 | // Helper function, which inserts PHI functions into SinkMBB: | |||
| 36465 | // %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ], | |||
| 36466 | // where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs | |||
| 36467 | // in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for | |||
| 36468 | // the last PHI function inserted. | |||
| 36469 | static MachineInstrBuilder createPHIsForCMOVsInSinkBB( | |||
| 36470 | MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, | |||
| 36471 | MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, | |||
| 36472 | MachineBasicBlock *SinkMBB) { | |||
| 36473 | MachineFunction *MF = TrueMBB->getParent(); | |||
| 36474 | const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); | |||
| 36475 | const DebugLoc &DL = MIItBegin->getDebugLoc(); | |||
| 36476 | ||||
| 36477 | X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm()); | |||
| 36478 | X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); | |||
| 36479 | ||||
| 36480 | MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin(); | |||
| 36481 | ||||
| 36482 | // As we are creating the PHIs, we have to be careful if there is more than | |||
| 36483 | // one. Later CMOVs may reference the results of earlier CMOVs, but later | |||
| 36484 | // PHIs have to reference the individual true/false inputs from earlier PHIs. | |||
| 36485 | // That also means that PHI construction must work forward from earlier to | |||
| 36486 | // later, and that the code must maintain a mapping from earlier PHI's | |||
| 36487 | // destination registers, and the registers that went into the PHI. | |||
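| | // For example, for the pair | |||
| | //   %t2 = CMOV %t1, %f1, cc | |||
| | //   %t3 = CMOV %t2, %f2, cc | |||
| | // the PHI for %t3 must use %t1 (what %t2 equals on that edge), not %t2. | |||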
| 36488 | DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable; | |||
| 36489 | MachineInstrBuilder MIB; | |||
| 36490 | ||||
| 36491 | for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { | |||
| 36492 | Register DestReg = MIIt->getOperand(0).getReg(); | |||
| 36493 | Register Op1Reg = MIIt->getOperand(1).getReg(); | |||
| 36494 | Register Op2Reg = MIIt->getOperand(2).getReg(); | |||
| 36495 | ||||
| 36496 | // If this CMOV we are generating is the opposite condition from | |||
| 36497 | // the jump we generated, then we have to swap the operands for the | |||
| 36498 | // PHI that is going to be generated. | |||
| 36499 | if (MIIt->getOperand(3).getImm() == OppCC) | |||
| 36500 | std::swap(Op1Reg, Op2Reg); | |||
| 36501 | ||||
| 36502 | if (RegRewriteTable.contains(Op1Reg)) | |||
| 36503 | Op1Reg = RegRewriteTable[Op1Reg].first; | |||
| 36504 | ||||
| 36505 | if (RegRewriteTable.contains(Op2Reg)) | |||
| 36506 | Op2Reg = RegRewriteTable[Op2Reg].second; | |||
| 36507 | ||||
| 36508 | MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg) | |||
| 36509 | .addReg(Op1Reg) | |||
| 36510 | .addMBB(FalseMBB) | |||
| 36511 | .addReg(Op2Reg) | |||
| 36512 | .addMBB(TrueMBB); | |||
| 36513 | ||||
| 36514 | // Add this PHI to the rewrite table. | |||
| 36515 | RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); | |||
| 36516 | } | |||
| 36517 | ||||
| 36518 | return MIB; | |||
| 36519 | } | |||
| 36520 | ||||
| 36521 | // Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2). | |||
| 36522 | MachineBasicBlock * | |||
| 36523 | X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, | |||
| 36524 | MachineInstr &SecondCascadedCMOV, | |||
| 36525 | MachineBasicBlock *ThisMBB) const { | |||
| 36526 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); | |||
| 36527 | const DebugLoc &DL = FirstCMOV.getDebugLoc(); | |||
| 36528 | ||||
| 36529 | // We lower cascaded CMOVs such as | |||
| 36530 | // | |||
| 36531 | // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2) | |||
| 36532 | // | |||
| 36533 | // to two successive branches. | |||
| 36534 | // | |||
| 36535 | // Without this, we would add a PHI between the two jumps, which ends up | |||
| 36536 | // creating a few copies all around. For instance, for | |||
| 36537 | // | |||
| 36538 | // (sitofp (zext (fcmp une))) | |||
| 36539 | // | |||
| 36540 | // we would generate: | |||
| 36541 | // | |||
| 36542 | // ucomiss %xmm1, %xmm0 | |||
| 36543 | // movss <1.0f>, %xmm0 | |||
| 36544 | // movaps %xmm0, %xmm1 | |||
| 36545 | // jne .LBB5_2 | |||
| 36546 | // xorps %xmm1, %xmm1 | |||
| 36547 | // .LBB5_2: | |||
| 36548 | // jp .LBB5_4 | |||
| 36549 | // movaps %xmm1, %xmm0 | |||
| 36550 | // .LBB5_4: | |||
| 36551 | // retq | |||
| 36552 | // | |||
| 36553 | // because this custom-inserter would have generated: | |||
| 36554 | // | |||
| 36555 | // A | |||
| 36556 | // | \ | |||
| 36557 | // | B | |||
| 36558 | // | / | |||
| 36559 | // C | |||
| 36560 | // | \ | |||
| 36561 | // | D | |||
| 36562 | // | / | |||
| 36563 | // E | |||
| 36564 | // | |||
| 36565 | // A: X = ...; Y = ... | |||
| 36566 | // B: empty | |||
| 36567 | // C: Z = PHI [X, A], [Y, B] | |||
| 36568 | // D: empty | |||
| 36569 | // E: PHI [X, C], [Z, D] | |||
| 36570 | // | |||
| 36571 | // If we lower both CMOVs in a single step, we can instead generate: | |||
| 36572 | // | |||
| 36573 | // A | |||
| 36574 | // | \ | |||
| 36575 | // | C | |||
| 36576 | // | /| | |||
| 36577 | // |/ | | |||
| 36578 | // | | | |||
| 36579 | // | D | |||
| 36580 | // | / | |||
| 36581 | // E | |||
| 36582 | // | |||
| 36583 | // A: X = ...; Y = ... | |||
| 36584 | // D: empty | |||
| 36585 | // E: PHI [X, A], [X, C], [Y, D] | |||
| 36586 | // | |||
| 36587 | // Which, in our sitofp/fcmp example, gives us something like: | |||
| 36588 | // | |||
| 36589 | // ucomiss %xmm1, %xmm0 | |||
| 36590 | // movss <1.0f>, %xmm0 | |||
| 36591 | // jne .LBB5_4 | |||
| 36592 | // jp .LBB5_4 | |||
| 36593 | // xorps %xmm0, %xmm0 | |||
| 36594 | // .LBB5_4: | |||
| 36595 | // retq | |||
| 36596 | // | |||
| 36597 | ||||
| 36598 | // We lower cascaded CMOV into two successive branches to the same block. | |||
| 36599 | // EFLAGS is used by both, so mark it as live in the second. | |||
| 36600 | const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock(); | |||
| 36601 | MachineFunction *F = ThisMBB->getParent(); | |||
| 36602 | MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB); | |||
| 36603 | MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB); | |||
| 36604 | MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB); | |||
| 36605 | ||||
| 36606 | MachineFunction::iterator It = ++ThisMBB->getIterator(); | |||
| 36607 | F->insert(It, FirstInsertedMBB); | |||
| 36608 | F->insert(It, SecondInsertedMBB); | |||
| 36609 | F->insert(It, SinkMBB); | |||
| 36610 | ||||
| 36611 | // For a cascaded CMOV, we lower it to two successive branches to | |||
| 36612 | // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in | |||
| 36613 | // the FirstInsertedMBB. | |||
| 36614 | FirstInsertedMBB->addLiveIn(X86::EFLAGS); | |||
| 36615 | ||||
| 36616 | // If the EFLAGS register isn't dead in the terminator, then claim that it's | |||
| 36617 | // live into the sink and copy blocks. | |||
| 36618 | const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); | |||
| 36619 | if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) && | |||
| 36620 | !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) { | |||
| 36621 | SecondInsertedMBB->addLiveIn(X86::EFLAGS); | |||
| 36622 | SinkMBB->addLiveIn(X86::EFLAGS); | |||
| 36623 | } | |||
| 36624 | ||||
| 36625 | // Transfer the remainder of ThisMBB and its successor edges to SinkMBB. | |||
| 36626 | SinkMBB->splice(SinkMBB->begin(), ThisMBB, | |||
| 36627 | std::next(MachineBasicBlock::iterator(FirstCMOV)), | |||
| 36628 | ThisMBB->end()); | |||
| 36629 | SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB); | |||
| 36630 | ||||
| 36631 | // Fallthrough block for ThisMBB. | |||
| 36632 | ThisMBB->addSuccessor(FirstInsertedMBB); | |||
| 36633 | // The true block target of the first branch is always SinkMBB. | |||
| 36634 | ThisMBB->addSuccessor(SinkMBB); | |||
| 36635 | // Fallthrough block for FirstInsertedMBB. | |||
| 36636 | FirstInsertedMBB->addSuccessor(SecondInsertedMBB); | |||
| 36637 | // The true block for the branch of FirstInsertedMBB. | |||
| 36638 | FirstInsertedMBB->addSuccessor(SinkMBB); | |||
| 36639 | // This is fallthrough. | |||
| 36640 | SecondInsertedMBB->addSuccessor(SinkMBB); | |||
| 36641 | ||||
| 36642 | // Create the conditional branch instructions. | |||
| 36643 | X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm()); | |||
| 36644 | BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(FirstCC); | |||
| 36645 | ||||
| 36646 | X86::CondCode SecondCC = | |||
| 36647 | X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm()); | |||
| 36648 | BuildMI(FirstInsertedMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(SecondCC); | |||
| 36649 | ||||
| 36650 | // SinkMBB: | |||
| 36651 | // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ] | |||
| 36652 | Register DestReg = SecondCascadedCMOV.getOperand(0).getReg(); | |||
| 36653 | Register Op1Reg = FirstCMOV.getOperand(1).getReg(); | |||
| 36654 | Register Op2Reg = FirstCMOV.getOperand(2).getReg(); | |||
| 36655 | MachineInstrBuilder MIB = | |||
| 36656 | BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg) | |||
| 36657 | .addReg(Op1Reg) | |||
| 36658 | .addMBB(SecondInsertedMBB) | |||
| 36659 | .addReg(Op2Reg) | |||
| 36660 | .addMBB(ThisMBB); | |||
| 36661 | ||||
| 36662 | // SecondInsertedMBB provides the same incoming value as FirstInsertedMBB | |||
| 36663 | // (the True operand of the SELECT_CC/CMOV nodes). | |||
| 36664 | MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB); | |||
| 36665 | ||||
| 36666 | // Now remove the CMOVs. | |||
| 36667 | FirstCMOV.eraseFromParent(); | |||
| 36668 | SecondCascadedCMOV.eraseFromParent(); | |||
| 36669 | ||||
| 36670 | return SinkMBB; | |||
| 36671 | } | |||
| 36672 | ||||
| 36673 | MachineBasicBlock * | |||
| 36674 | X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, | |||
| 36675 | MachineBasicBlock *ThisMBB) const { | |||
| 36676 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); | |||
| 36677 | const DebugLoc &DL = MI.getDebugLoc(); | |||
| 36678 | ||||
| 36679 | // To "insert" a SELECT_CC instruction, we actually have to insert the | |||
| 36680 | // diamond control-flow pattern. The incoming instruction knows the | |||
| 36681 | // destination vreg to set, the condition code register to branch on, the | |||
| 36682 | // true/false values to select between and a branch opcode to use. | |||
| 36683 | ||||
| 36684 | // ThisMBB: | |||
| 36685 | // ... | |||
| 36686 | // TrueVal = ... | |||
| 36687 | // cmpTY ccX, r1, r2 | |||
| 36688 | // bCC copy1MBB | |||
| 36689 | // fallthrough --> FalseMBB | |||
| 36690 | ||||
| 36691 | // This code lowers all pseudo-CMOV instructions. Generally it lowers these | |||
| 36692 | // as described above, by inserting a BB, and then making a PHI at the join | |||
| 36693 | // point to select the true and false operands of the CMOV in the PHI. | |||
| 36694 | // | |||
| 36695 | // The code also handles two different cases of multiple CMOV opcodes | |||
| 36696 | // in a row. | |||
| 36697 | // | |||
| 36698 | // Case 1: | |||
| 36699 | // In this case, there are multiple CMOVs in a row, all of which are based on | |||
| 36700 | // the same condition setting (or the exact opposite condition setting). | |||
| 36701 | // In this case we can lower all the CMOVs using a single inserted BB, and | |||
| 36702 | // then make a number of PHIs at the join point to model the CMOVs. The only | |||
| 36703 | // trickiness here, is that in a case like: | |||
| 36704 | // | |||
| 36705 | // t2 = CMOV cond1 t1, f1 | |||
| 36706 | // t3 = CMOV cond1 t2, f2 | |||
| 36707 | // | |||
| 36708 | // when rewriting this into PHIs, we have to perform some renaming on the | |||
| 36709 | // temps since you cannot have a PHI operand refer to a PHI result earlier | |||
| 36710 | // in the same block. The "simple" but wrong lowering would be: | |||
| 36711 | // | |||
| 36712 | // t2 = PHI t1(BB1), f1(BB2) | |||
| 36713 | // t3 = PHI t2(BB1), f2(BB2) | |||
| 36714 | // | |||
| 36715 | // but clearly t2 is not defined in BB1, so that is incorrect. The proper | |||
| 36716 | // renaming is to note that on the path through BB1, t2 is really just a | |||
| 36717 | // copy of t1, and do that renaming, properly generating: | |||
| 36718 | // | |||
| 36719 | // t2 = PHI t1(BB1), f1(BB2) | |||
| 36720 | // t3 = PHI t1(BB1), f2(BB2) | |||
| 36721 | // | |||
| 36722 | // Case 2: | |||
| 36723 | // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate | |||
| 36724 | // function - EmitLoweredCascadedSelect. | |||
| 36725 | ||||
| 36726 | X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm()); | |||
| 36727 | X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); | |||
| 36728 | MachineInstr *LastCMOV = &MI; | |||
| 36729 | MachineBasicBlock::iterator NextMIIt = MachineBasicBlock::iterator(MI); | |||
| 36730 | ||||
| 36731 | // Check for case 1, where there are multiple CMOVs with the same condition | |||
| 36732 | // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the | |||
| 36733 | // number of jumps the most. | |||
| 36734 | ||||
| 36735 | if (isCMOVPseudo(MI)) { | |||
| 36736 | // See if we have a string of CMOVS with the same condition. Skip over | |||
| 36737 | // intervening debug insts. | |||
| 36738 | while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) && | |||
| 36739 | (NextMIIt->getOperand(3).getImm() == CC || | |||
| 36740 | NextMIIt->getOperand(3).getImm() == OppCC)) { | |||
| 36741 | LastCMOV = &*NextMIIt; | |||
| 36742 | NextMIIt = next_nodbg(NextMIIt, ThisMBB->end()); | |||
| 36743 | } | |||
| 36744 | } | |||
| 36745 | ||||
| 36746 | // This checks for case 2, but only if we didn't already find case 1, as | |||
| 36747 | // indicated by LastCMOV still pointing at MI. | |||
| 36748 | if (LastCMOV == &MI && NextMIIt != ThisMBB->end() && | |||
| 36749 | NextMIIt->getOpcode() == MI.getOpcode() && | |||
| 36750 | NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() && | |||
| 36751 | NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() && | |||
| 36752 | NextMIIt->getOperand(1).isKill()) { | |||
| 36753 | return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB); | |||
| 36754 | } | |||
| 36755 | ||||
| 36756 | const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock(); | |||
| 36757 | MachineFunction *F = ThisMBB->getParent(); | |||
| 36758 | MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB); | |||
| 36759 | MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB); | |||
| 36760 | ||||
| 36761 | MachineFunction::iterator It = ++ThisMBB->getIterator(); | |||
| 36762 | F->insert(It, FalseMBB); | |||
| 36763 | F->insert(It, SinkMBB); | |||
| 36764 | ||||
| 36765 | // If the EFLAGS register isn't dead in the terminator, then claim that it's | |||
| 36766 | // live into the sink and copy blocks. | |||
| 36767 | const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); | |||
| 36768 | if (!LastCMOV->killsRegister(X86::EFLAGS) && | |||
| 36769 | !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) { | |||
| 36770 | FalseMBB->addLiveIn(X86::EFLAGS); | |||
| 36771 | SinkMBB->addLiveIn(X86::EFLAGS); | |||
| 36772 | } | |||
| 36773 | ||||
| 36774 | // Transfer any debug instructions inside the CMOV sequence to the sunk block. | |||
| 36775 | auto DbgRange = llvm::make_range(MachineBasicBlock::iterator(MI), | |||
| 36776 | MachineBasicBlock::iterator(LastCMOV)); | |||
| 36777 | for (MachineInstr &MI : llvm::make_early_inc_range(DbgRange)) | |||
| 36778 | if (MI.isDebugInstr()) | |||
| 36779 | SinkMBB->push_back(MI.removeFromParent()); | |||
| 36780 | ||||
| 36781 | // Transfer the remainder of ThisMBB and its successor edges to SinkMBB. | |||
| 36782 | SinkMBB->splice(SinkMBB->end(), ThisMBB, | |||
| 36783 | std::next(MachineBasicBlock::iterator(LastCMOV)), | |||
| 36784 | ThisMBB->end()); | |||
| 36785 | SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB); | |||
| 36786 | ||||
| 36787 | // Fallthrough block for ThisMBB. | |||
| 36788 | ThisMBB->addSuccessor(FalseMBB); | |||
| 36789 | // The true block target of the first (or only) branch is always a SinkMBB. | |||
| 36790 | ThisMBB->addSuccessor(SinkMBB); | |||
| 36791 | // Fallthrough block for FalseMBB. | |||
| 36792 | FalseMBB->addSuccessor(SinkMBB); | |||
| 36793 | ||||
| 36794 | // Create the conditional branch instruction. | |||
| 36795 | BuildMI(ThisMBB, DL, TII->get(X86::JCC_1)).addMBB(SinkMBB).addImm(CC); | |||
| 36796 | ||||
| 36797 | // SinkMBB: | |||
| 36798 | // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ] | |||
| 36799 | // ... | |||
| 36800 | MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); | |||
| 36801 | MachineBasicBlock::iterator MIItEnd = | |||
| 36802 | std::next(MachineBasicBlock::iterator(LastCMOV)); | |||
| 36803 | createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB); | |||
| 36804 | ||||
| 36805 | // Now remove the CMOV(s). | |||
| 36806 | ThisMBB->erase(MIItBegin, MIItEnd); | |||
| 36807 | ||||
| 36808 | return SinkMBB; | |||
| 36809 | } | |||
| 36810 | ||||
| 36811 | static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) { | |||
| 36812 | if (IsLP64) { | |||
| 36813 | if (isInt<8>(Imm)) | |||
| 36814 | return X86::SUB64ri8; | |||
| 36815 | return X86::SUB64ri32; | |||
| 36816 | } else { | |||
| 36817 | if (isInt<8>(Imm)) | |||
| 36818 | return X86::SUB32ri8; | |||
| 36819 | return X86::SUB32ri; | |||
| 36820 | } | |||
| 36821 | } | |||
| 36822 | ||||
| 36823 | MachineBasicBlock * | |||
| 36824 | X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI, | |||
| 36825 | MachineBasicBlock *MBB) const { | |||
| 36826 | MachineFunction *MF = MBB->getParent(); | |||
| 36827 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); | |||
| 36828 | const X86FrameLowering &TFI = *Subtarget.getFrameLowering(); | |||
| 36829 | const DebugLoc &DL = MI.getDebugLoc(); | |||
| 36830 | const BasicBlock *LLVM_BB = MBB->getBasicBlock(); | |||
| 36831 | ||||
| 36832 | const unsigned ProbeSize = getStackProbeSize(*MF); | |||
| 36833 | ||||
| 36834 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
| 36835 | MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB); | |||
| 36836 | MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB); | |||
| 36837 | MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB); | |||
| 36838 | ||||
| 36839 | MachineFunction::iterator MBBIter = ++MBB->getIterator(); | |||
| 36840 | MF->insert(MBBIter, testMBB); | |||
| 36841 | MF->insert(MBBIter, blockMBB); | |||
| 36842 | MF->insert(MBBIter, tailMBB); | |||
| 36843 | ||||
| 36844 | Register sizeVReg = MI.getOperand(1).getReg(); | |||
| 36845 | ||||
| 36846 | Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP; | |||
| 36847 | ||||
| 36848 | Register TmpStackPtr = MRI.createVirtualRegister( | |||
| 36849 | TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass); | |||
| 36850 | Register FinalStackPtr = MRI.createVirtualRegister( | |||
| 36851 | TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass); | |||
| 36852 | ||||
| 36853 | BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr) | |||
| 36854 | .addReg(physSPReg); | |||
| 36855 | { | |||
| 36856 | const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr; | |||
| 36857 | BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr) | |||
| 36858 | .addReg(TmpStackPtr) | |||
| 36859 | .addReg(sizeVReg); | |||
| 36860 | } | |||
| 36861 | ||||
| 36862 | // test rsp size | |||
| 36863 | ||||
| 36864 | BuildMI(testMBB, DL, | |||
| 36865 | TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr)) | |||
| 36866 | .addReg(FinalStackPtr) | |||
| 36867 | .addReg(physSPReg); | |||
| 36868 | ||||
| 36869 | BuildMI(testMBB, DL, TII->get(X86::JCC_1)) | |||
| 36870 | .addMBB(tailMBB) | |||
| 36871 | .addImm(X86::COND_GE); | |||
| 36872 | testMBB->addSuccessor(blockMBB); | |||
| 36873 | testMBB->addSuccessor(tailMBB); | |||
| 36874 | ||||
| 36875 | // Touch the page, then extend the allocation. This is the opposite order from | |||
| 36876 | // a static probe, where we allocate then touch; it avoids having to probe the | |||
| 36877 | // tail of the static alloca. Possible scenarios are: | |||
| 36878 | // | |||
| 36879 | // + ---- <- ------------ <- ------------- <- ------------ + | |||
| 36880 | // | | | |||
| 36881 | // [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] + -> [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc] + | |||
| 36882 | // | | | |||
| 36883 | // + <- ----------- <- ------------ <- ----------- <- ------------ + | |||
| 36884 | // | |||
| 36885 | // The property we want to enforce is to never have more than [page alloc] between two probes. | |||
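| | // | |||
| | // Roughly, the emitted loop is (illustrative AT&T asm, 64-bit): | |||
| | // .Ltest: | |||
| | //   cmpq %rsp, %r11       # %r11 stands in for FinalStackPtr | |||
| | //   jge  .Ltail           # done once FinalStackPtr >= %rsp | |||
| | //   xorq $0, (%rsp)       # touch the current page | |||
| | //   subq $ProbeSize, %rsp # then extend by one probe step | |||
| | //   jmp  .Ltest | |||
| | // .Ltail: | |||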
| 36886 | ||||
| 36887 | const unsigned XORMIOpc = | |||
| 36888 | TFI.Uses64BitFramePtr ? X86::XOR64mi8 : X86::XOR32mi8; | |||
| 36889 | addRegOffset(BuildMI(blockMBB, DL, TII->get(XORMIOpc)), physSPReg, false, 0) | |||
| 36890 | .addImm(0); | |||
| 36891 | ||||
| 36892 | BuildMI(blockMBB, DL, | |||
| 36893 | TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg) | |||
| 36894 | .addReg(physSPReg) | |||
| 36895 | .addImm(ProbeSize); | |||
| 36896 | ||||
| 36898 | BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB); | |||
| 36899 | blockMBB->addSuccessor(testMBB); | |||
| 36900 | ||||
| 36901 | // Replace original instruction by the expected stack ptr | |||
| 36902 | BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg()) | |||
| 36903 | .addReg(FinalStackPtr); | |||
| 36904 | ||||
| 36905 | tailMBB->splice(tailMBB->end(), MBB, | |||
| 36906 | std::next(MachineBasicBlock::iterator(MI)), MBB->end()); | |||
| 36907 | tailMBB->transferSuccessorsAndUpdatePHIs(MBB); | |||
| 36908 | MBB->addSuccessor(testMBB); | |||
| 36909 | ||||
| 36910 | // Delete the original pseudo instruction. | |||
| 36911 | MI.eraseFromParent(); | |||
| 36912 | ||||
| 36913 | // And we're done. | |||
| 36914 | return tailMBB; | |||
| 36915 | } | |||
| 36916 | ||||
| 36917 | MachineBasicBlock * | |||
| 36918 | X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI, | |||
| 36919 | MachineBasicBlock *BB) const { | |||
| 36920 | MachineFunction *MF = BB->getParent(); | |||
| 36921 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); | |||
| 36922 | const DebugLoc &DL = MI.getDebugLoc(); | |||
| 36923 | const BasicBlock *LLVM_BB = BB->getBasicBlock(); | |||
| 36924 | ||||
| 36925 | assert(MF->shouldSplitStack()); | |||
| 36926 | ||||
| 36927 | const bool Is64Bit = Subtarget.is64Bit(); | |||
| 36928 | const bool IsLP64 = Subtarget.isTarget64BitLP64(); | |||
| 36929 | ||||
| 36930 | const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS; | |||
| 36931 | const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30; | |||
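| | // Illustrative: on x86-64 LP64 the stack limit lives in the fixed TLS slot | |||
| | // %fs:0x70, so the comparison below is effectively 'cmp %fs:0x70, <new SP>', | |||
| | // branching to mallocMBB when the limit exceeds the would-be stack pointer. | |||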
| 36932 | ||||
| 36933 | // BB: | |||
| 36934 | // ... [Till the alloca] | |||
| 36935 | // If stacklet is not large enough, jump to mallocMBB | |||
| 36936 | // | |||
| 36937 | // bumpMBB: | |||
| 36938 | // Allocate by subtracting from RSP | |||
| 36939 | // Jump to continueMBB | |||
| 36940 | // | |||
| 36941 | // mallocMBB: | |||
| 36942 | // Allocate by call to runtime | |||
| 36943 | // | |||
| 36944 | // continueMBB: | |||
| 36945 | // ... | |||
| 36946 | // [rest of original BB] | |||
| 36947 | // | |||
| 36948 | ||||
| 36949 | MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB); | |||
| 36950 | MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB); | |||
| 36951 | MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB); | |||
| 36952 | ||||
| 36953 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
| 36954 | const TargetRegisterClass *AddrRegClass = | |||
| 36955 | getRegClassFor(getPointerTy(MF->getDataLayout())); | |||
| 36956 | ||||
| 36957 | Register mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass), | |||
| 36958 | bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass), | |||
| 36959 | tmpSPVReg = MRI.createVirtualRegister(AddrRegClass), | |||
| 36960 | SPLimitVReg = MRI.createVirtualRegister(AddrRegClass), | |||
| 36961 | sizeVReg = MI.getOperand(1).getReg(), | |||
| 36962 | physSPReg = | |||
| 36963 | IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP; | |||
| 36964 | ||||
| 36965 | MachineFunction::iterator MBBIter = ++BB->getIterator(); | |||
| 36966 | ||||
| 36967 | MF->insert(MBBIter, bumpMBB); | |||
| 36968 | MF->insert(MBBIter, mallocMBB); | |||
| 36969 | MF->insert(MBBIter, continueMBB); | |||
| 36970 | ||||
| 36971 | continueMBB->splice(continueMBB->begin(), BB, | |||
| 36972 | std::next(MachineBasicBlock::iterator(MI)), BB->end()); | |||
| 36973 | continueMBB->transferSuccessorsAndUpdatePHIs(BB); | |||
| 36974 | ||||
| 36975 | // Add code to the main basic block to check if the stack limit has been hit, | |||
| 36976 | // and if so, jump to mallocMBB otherwise to bumpMBB. | |||
| 36977 | BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg); | |||
| 36978 | BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg) | |||
| 36979 | .addReg(tmpSPVReg).addReg(sizeVReg); | |||
| 36980 | BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr)) | |||
| 36981 | .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg) | |||
| 36982 | .addReg(SPLimitVReg); | |||
| 36983 | BuildMI(BB, DL, TII->get(X86::JCC_1)).addMBB(mallocMBB).addImm(X86::COND_G); | |||
| 36984 | ||||
| 36985 | // bumpMBB simply decreases the stack pointer, since we know the current | |||
| 36986 | // stacklet has enough space. | |||
| 36987 | BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg) | |||
| 36988 | .addReg(SPLimitVReg); | |||
| 36989 | BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg) | |||
| 36990 | .addReg(SPLimitVReg); | |||
| 36991 | BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); | |||
| 36992 | ||||
| 36993 | // Calls into a routine in libgcc to allocate more space from the heap. | |||
| 36994 | const uint32_t *RegMask = | |||
| 36995 | Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C); | |||
| 36996 | if (IsLP64) { | |||
| 36997 | BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI) | |||
| 36998 | .addReg(sizeVReg); | |||
| 36999 | BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) | |||
| 37000 | .addExternalSymbol("__morestack_allocate_stack_space") | |||
| 37001 | .addRegMask(RegMask) | |||
| 37002 | .addReg(X86::RDI, RegState::Implicit) | |||
| 37003 | .addReg(X86::RAX, RegState::ImplicitDefine); | |||
| 37004 | } else if (Is64Bit) { | |||
| 37005 | BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI) | |||
| 37006 | .addReg(sizeVReg); | |||
| 37007 | BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32)) | |||
| 37008 | .addExternalSymbol("__morestack_allocate_stack_space") | |||
| 37009 | .addRegMask(RegMask) | |||
| 37010 | .addReg(X86::EDI, RegState::Implicit) | |||
| 37011 | .addReg(X86::EAX, RegState::ImplicitDefine); | |||
| 37012 | } else { | |||
| 37013 | BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg) | |||
| 37014 | .addImm(12); | |||
| 37015 | BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg); | |||
| 37016 | BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32)) | |||
| 37017 | .addExternalSymbol("__morestack_allocate_stack_space") | |||
| 37018 | .addRegMask(RegMask) | |||
| 37019 | .addReg(X86::EAX, RegState::ImplicitDefine); | |||
| 37020 | } | |||
| 37021 | ||||
| 37022 | if (!Is64Bit) | |||
| 37023 | BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg) | |||
| 37024 | .addImm(16); | |||
| 37025 | ||||
| 37026 | BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg) | |||
| 37027 | .addReg(IsLP64 ? X86::RAX : X86::EAX); | |||
| 37028 | BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB); | |||
| 37029 | ||||
| 37030 | // Set up the CFG correctly. | |||
| 37031 | BB->addSuccessor(bumpMBB); | |||
| 37032 | BB->addSuccessor(mallocMBB); | |||
| 37033 | mallocMBB->addSuccessor(continueMBB); | |||
| 37034 | bumpMBB->addSuccessor(continueMBB); | |||
| 37035 | ||||
| 37036 | // Take care of the PHI nodes. | |||
| 37037 | BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI), | |||
| 37038 | MI.getOperand(0).getReg()) | |||
| 37039 | .addReg(mallocPtrVReg) | |||
| 37040 | .addMBB(mallocMBB) | |||
| 37041 | .addReg(bumpSPPtrVReg) | |||
| 37042 | .addMBB(bumpMBB); | |||
| 37043 | ||||
| 37044 | // Delete the original pseudo instruction. | |||
| 37045 | MI.eraseFromParent(); | |||
| 37046 | ||||
| 37047 | // And we're done. | |||
| 37048 | return continueMBB; | |||
| 37049 | } | |||
| 37050 | ||||
| 37051 | MachineBasicBlock * | |||
| 37052 | X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI, | |||
| 37053 | MachineBasicBlock *BB) const { | |||
| 37054 | MachineFunction *MF = BB->getParent(); | |||
| 37055 | const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); | |||
| 37056 | MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB(); | |||
| 37057 | const DebugLoc &DL = MI.getDebugLoc(); | |||
| 37058 | ||||
| 37059 | assert(!isAsynchronousEHPersonality( | |||
| 37060 | classifyEHPersonality(MF->getFunction().getPersonalityFn())) && | |||
| 37061 | "SEH does not use catchret!"); | |||
| 37062 | ||||
| 37063 | // Only 32-bit EH needs to worry about manually restoring stack pointers. | |||
| 37064 | if (!Subtarget.is32Bit()) | |||
| 37065 | return BB; | |||
| 37066 | ||||
| 37067 | // C++ EH creates a new target block to hold the restore code, and wires up | |||
| 37068 | // the new block to the return destination with a normal JMP_4. | |||
| 37069 | MachineBasicBlock *RestoreMBB = | |||
| 37070 | MF->CreateMachineBasicBlock(BB->getBasicBlock()); | |||
| 37071 | assert(BB->succ_size() == 1); | |||
| 37072 | MF->insert(std::next(BB->getIterator()), RestoreMBB); | |||
| 37073 | RestoreMBB->transferSuccessorsAndUpdatePHIs(BB); | |||
| 37074 | BB->addSuccessor(RestoreMBB); | |||
| 37075 | MI.getOperand(0).setMBB(RestoreMBB); | |||
| 37076 | ||||
| 37077 | // Marking this as an EH pad but not a funclet entry block causes PEI to | |||
| 37078 | // restore stack pointers in the block. | |||
| 37079 | RestoreMBB->setIsEHPad(true); | |||
| 37080 | ||||
| 37081 | auto RestoreMBBI = RestoreMBB->begin(); | |||
| 37082 | BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB); | |||
| 37083 | return BB; | |||
| 37084 | } | |||
| 37085 | ||||
| 37086 | MachineBasicBlock * | |||
| 37087 | X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, | |||
| 37088 | MachineBasicBlock *BB) const { | |||
| 37089 | // So, here we replace TLSADDR with the sequence: | |||
| 37090 | // adjust_stackdown -> TLSADDR -> adjust_stackup. | |||
| 37091 | // We need this because TLSADDR is lowered into calls | |||
| 37092 | // inside MC; without the two markers, shrink-wrapping | |||
| 37093 | // may push the prologue/epilogue past them. | |||
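| | // Schematically, the rewrite performed here is: | |||
| | //   ADJCALLSTACKDOWN 0, 0, 0    # CALLSEQ_START marker | |||
| | //   TLSADDR ...                 # becomes a call during MC lowering | |||
| | //   ADJCALLSTACKUP 0, 0         # CALLSEQ_END marker | |||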
| 37094 | const TargetInstrInfo &TII = *Subtarget.getInstrInfo(); | |||
| 37095 | const DebugLoc &DL = MI.getDebugLoc(); | |||
| 37096 | MachineFunction &MF = *BB->getParent(); | |||
| 37097 | ||||
| 37098 | // Emit CALLSEQ_START right before the instruction. | |||
| 37099 | unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); | |||
| 37100 | MachineInstrBuilder CallseqStart = | |||
| 37101 | BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0); | |||
| 37102 | BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); | |||
| 37103 | ||||
| 37104 | // Emit CALLSEQ_END right after the instruction. | |||
| 37105 | // We don't call erase from parent because we want to keep the | |||
| 37106 | // original instruction around. | |||
| 37107 | unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); | |||
| 37108 | MachineInstrBuilder CallseqEnd = | |||
| 37109 | BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0); | |||
| 37110 | BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd); | |||
| 37111 | ||||
| 37112 | return BB; | |||
| 37113 | } | |||
| 37114 | ||||
| 37115 | MachineBasicBlock * | |||
| 37116 | X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI, | |||
| 37117 | MachineBasicBlock *BB) const { | |||
| 37118 | // This is pretty easy. We're taking the value that we received from | |||
| 37119 | // our load from the relocation, sticking it in either RDI (x86-64) | |||
| 37120 | // or EAX and doing an indirect call. The return value will then | |||
| 37121 | // be in the normal return register. | |||
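| | // For illustration, on x86-64 the emitted sequence is roughly (symbol name | |||
| | // hypothetical): | |||
| | //   movq _var@TLVP(%rip), %rdi   # load the TLV descriptor address | |||
| | //   callq *(%rdi)                # call the getter; result lands in %rax | |||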
| 37122 | MachineFunction *F = BB->getParent(); | |||
| 37123 | const X86InstrInfo *TII = Subtarget.getInstrInfo(); | |||
| 37124 | const DebugLoc &DL = MI.getDebugLoc(); | |||
| 37125 | ||||
| 37126 | assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?"); | |||
| 37127 | assert(MI.getOperand(3).isGlobal() && "This should be a global"); | |||
| 37128 | ||||
| 37129 | // Get a register mask for the lowered call. | |||
| 37130 | // FIXME: The 32-bit calls have non-standard calling conventions. Use a | |||
| 37131 | // proper register mask. | |||
| 37132 | const uint32_t *RegMask = | |||
| 37133 | Subtarget.is64Bit() ? | |||
| 37134 | Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() : | |||
| 37135 | Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C); | |||
| 37136 | if (Subtarget.is64Bit()) { | |||
| 37137 | MachineInstrBuilder MIB = | |||
| 37138 | BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI) | |||
| 37139 | .addReg(X86::RIP) | |||
| 37140 | .addImm(0) | |||
| 37141 | .addReg(0) | |||
| 37142 | .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, | |||
| 37143 | MI.getOperand(3).getTargetFlags()) | |||
| 37144 | .addReg(0); | |||
| 37145 | MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m)); | |||
| 37146 | addDirectMem(MIB, X86::RDI); | |||
| 37147 | MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask); | |||
| 37148 | } else if (!isPositionIndependent()) { | |||
| 37149 | MachineInstrBuilder MIB = | |||
| 37150 | BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) | |||
| 37151 | .addReg(0) | |||
| 37152 | .addImm(0) | |||
| 37153 | .addReg(0) | |||
| 37154 | .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, | |||
| 37155 | MI.getOperand(3).getTargetFlags()) | |||
| 37156 | .addReg(0); | |||
| 37157 | MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); | |||
| 37158 | addDirectMem(MIB, X86::EAX); | |||
| 37159 | MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); | |||
| 37160 | } else { | |||
| 37161 | MachineInstrBuilder MIB = | |||
| 37162 | BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX) | |||
| 37163 | .addReg(TII->getGlobalBaseReg(F)) | |||
| 37164 | .addImm(0) | |||
| 37165 | .addReg(0) | |||
| 37166 | .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, | |||
| 37167 | MI.getOperand(3).getTargetFlags()) | |||
| 37168 | .addReg(0); | |||
| 37169 | MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m)); | |||
| 37170 | addDirectMem(MIB, X86::EAX); | |||
| 37171 | MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask); | |||
| 37172 | } | |||
| 37173 | ||||
| 37174 | MI.eraseFromParent(); // The pseudo instruction is gone now. | |||
| 37175 | return BB; | |||
| 37176 | } | |||
| 37177 | ||||
| 37178 | static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) { | |||
| 37179 | switch (RPOpc) { | |||
| 37180 | case X86::INDIRECT_THUNK_CALL32: | |||
| 37181 | return X86::CALLpcrel32; | |||
| 37182 | case X86::INDIRECT_THUNK_CALL64: | |||
| 37183 | return X86::CALL64pcrel32; | |||
| 37184 | case X86::INDIRECT_THUNK_TCRETURN32: | |||
| 37185 | return X86::TCRETURNdi; | |||
| 37186 | case X86::INDIRECT_THUNK_TCRETURN64: | |||
| 37187 | return X86::TCRETURNdi64; | |||
| 37188 | } | |||
| 37189 | llvm_unreachable("not indirect thunk opcode")::llvm::llvm_unreachable_internal("not indirect thunk opcode" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 37189); | |||
| 37190 | } | |||
| 37191 | ||||
| 37192 | static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget, | |||
| 37193 | unsigned Reg) { | |||
| 37194 | if (Subtarget.useRetpolineExternalThunk()) { | |||
| 37195 | // When using an external thunk for retpolines, we pick names that match the | |||
| 37196 | // names GCC happens to use as well. This helps simplify the implementation | |||
| 37197 | // of the thunks for kernels where they have no easy ability to create | |||
| 37198 | // aliases and are doing non-trivial configuration of the thunk's body. For | |||
| 37199 | // example, the Linux kernel will do boot-time hot patching of the thunk | |||
| 37200 | // bodies and cannot easily export aliases of these to loaded modules. | |||
| 37201 | // | |||
| 37202 | // Note that at any point in the future, we may need to change the semantics | |||
| 37203 | // of how we implement retpolines and at that time will likely change the | |||
| 37204 | // name of the called thunk. Essentially, there is no hard guarantee that | |||
| 37205 | // LLVM will generate calls to specific thunks; we merely make a best-effort | |||
| 37206 | // attempt to help out kernels and other systems where duplicating the | |||
| 37207 | // thunks is costly. | |||
| 37208 | switch (Reg) { | |||
| 37209 | case X86::EAX: | |||
| 37210 | assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); | |||
| 37211 | return "__x86_indirect_thunk_eax"; | |||
| 37212 | case X86::ECX: | |||
| 37213 | assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); | |||
| 37214 | return "__x86_indirect_thunk_ecx"; | |||
| 37215 | case X86::EDX: | |||
| 37216 | assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); | |||
| 37217 | return "__x86_indirect_thunk_edx"; | |||
| 37218 | case X86::EDI: | |||
| 37219 | assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); | |||
| 37220 | return "__x86_indirect_thunk_edi"; | |||
| 37221 | case X86::R11: | |||
| 37222 | assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); | |||
| 37223 | return "__x86_indirect_thunk_r11"; | |||
| 37224 | } | |||
| 37225 | llvm_unreachable("unexpected reg for external indirect thunk"); | |||
| 37226 | } | |||
| 37227 | ||||
| 37228 | if (Subtarget.useRetpolineIndirectCalls() || | |||
| 37229 | Subtarget.useRetpolineIndirectBranches()) { | |||
| 37230 | // When targeting an internal COMDAT thunk use an LLVM-specific name. | |||
| 37231 | switch (Reg) { | |||
| 37232 | case X86::EAX: | |||
| 37233 | assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); | |||
| 37234 | return "__llvm_retpoline_eax"; | |||
| 37235 | case X86::ECX: | |||
| 37236 | assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); | |||
| 37237 | return "__llvm_retpoline_ecx"; | |||
| 37238 | case X86::EDX: | |||
| 37239 | assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); | |||
| 37240 | return "__llvm_retpoline_edx"; | |||
| 37241 | case X86::EDI: | |||
| 37242 | assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!"); | |||
| 37243 | return "__llvm_retpoline_edi"; | |||
| 37244 | case X86::R11: | |||
| 37245 | assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); | |||
| 37246 | return "__llvm_retpoline_r11"; | |||
| 37247 | } | |||
| 37248 | llvm_unreachable("unexpected reg for retpoline"); | |||
| 37249 | } | |||
| 37250 | ||||
| 37251 | if (Subtarget.useLVIControlFlowIntegrity()) { | |||
| 37252 | assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!"); | |||
| 37253 | return "__llvm_lvi_thunk_r11"; | |||
| 37254 | } | |||
| 37255 | llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature")::llvm::llvm_unreachable_internal("getIndirectThunkSymbol() invoked without thunk feature" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 37255); | |||
| 37256 | } | |||
| 37257 | ||||
| 37258 | MachineBasicBlock * | |||
| 37259 | X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI, | |||
| 37260 | MachineBasicBlock *BB) const { | |||
| 37261 | // Copy the virtual register into the R11 physical register and | |||
| 37262 | // call the retpoline thunk. | |||
| 37263 | const DebugLoc &DL = MI.getDebugLoc(); | |||
| 37264 | const X86InstrInfo *TII = Subtarget.getInstrInfo(); | |||
| 37265 | Register CalleeVReg = MI.getOperand(0).getReg(); | |||
| 37266 | unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode()); | |||
| 37267 | ||||
| 37268 | // Find an available scratch register to hold the callee. On 64-bit, we can | |||
| 37269 | // just use R11, but we scan for uses anyway to ensure we don't generate | |||
| 37270 | // incorrect code. On 32-bit, we use one of EAX, ECX, or EDX that isn't | |||
| 37271 | // already a register use operand to the call to hold the callee. If none | |||
| 37272 | // are available, use EDI instead. EDI is chosen because EBX is the PIC base | |||
| 37273 | // register and ESI is the base pointer to realigned stack frames with VLAs. | |||
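| | // E.g. on 32-bit, when EAX is still free the pseudo becomes roughly: | |||
| | //   movl %callee, %eax | |||
| | //   calll __llvm_retpoline_eax   # the thunk performs the indirect transfer | |||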
| 37274 | SmallVector<unsigned, 3> AvailableRegs; | |||
| 37275 | if (Subtarget.is64Bit()) | |||
| 37276 | AvailableRegs.push_back(X86::R11); | |||
| 37277 | else | |||
| 37278 | AvailableRegs.append({X86::EAX, X86::ECX, X86::EDX, X86::EDI}); | |||
| 37279 | ||||
| 37280 | // Zero out any registers that are already used. | |||
| 37281 | for (const auto &MO : MI.operands()) { | |||
| 37282 | if (MO.isReg() && MO.isUse()) | |||
| 37283 | for (unsigned &Reg : AvailableRegs) | |||
| 37284 | if (Reg == MO.getReg()) | |||
| 37285 | Reg = 0; | |||
| 37286 | } | |||
| 37287 | ||||
| 37288 | // Choose the first remaining non-zero available register. | |||
| 37289 | unsigned AvailableReg = 0; | |||
| 37290 | for (unsigned MaybeReg : AvailableRegs) { | |||
| 37291 | if (MaybeReg) { | |||
| 37292 | AvailableReg = MaybeReg; | |||
| 37293 | break; | |||
| 37294 | } | |||
| 37295 | } | |||
| 37296 | if (!AvailableReg) | |||
| 37297 | report_fatal_error("calling convention incompatible with retpoline, no " | |||
| 37298 | "available registers"); | |||
| 37299 | ||||
| 37300 | const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg); | |||
| 37301 | ||||
| 37302 | BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg) | |||
| 37303 | .addReg(CalleeVReg); | |||
| 37304 | MI.getOperand(0).ChangeToES(Symbol); | |||
| 37305 | MI.setDesc(TII->get(Opc)); | |||
| 37306 | MachineInstrBuilder(*BB->getParent(), &MI) | |||
| 37307 | .addReg(AvailableReg, RegState::Implicit | RegState::Kill); | |||
| 37308 | return BB; | |||
| 37309 | } | |||
| 37310 | ||||
| 37311 | /// A SetJmp implies that a future change of control flow will occur when | |||
| 37312 | /// the corresponding LongJmp is called. | |||
| 37313 | /// Instead of using the 'return' instruction, the long jump fixes the stack and | |||
| 37314 | /// performs an indirect branch. To do so it uses the registers that were stored | |||
| 37315 | /// in the jump buffer (when calling SetJmp). | |||
| 37316 | /// In case the shadow stack is enabled we need to fix it as well, because some | |||
| 37317 | /// return addresses will be skipped. | |||
| 37318 | /// The function will save the SSP for future fixing in the function | |||
| 37319 | /// emitLongJmpShadowStackFix. | |||
| 37320 | /// \sa emitLongJmpShadowStackFix | |||
| 37321 | /// \param [in] MI The temporary Machine Instruction for the builtin. | |||
| 37322 | /// \param [in] MBB The Machine Basic Block that will be modified. | |||
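| | /// A sketch of the emitted code on 64-bit targets (32-bit uses the | |||
| | /// D-suffixed opcodes and a 12-byte slot offset): | |||
| | ///   xorq   %z, %z | |||
| | ///   rdsspq %z              # current SSP; stays 0 when CET is absent | |||
| | ///   movq   %z, 24(buf)     # stored at slot 3 * sizeof(void *) | |||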
| 37323 | void X86TargetLowering::emitSetJmpShadowStackFix(MachineInstr &MI, | |||
| 37324 | MachineBasicBlock *MBB) const { | |||
| 37325 | const DebugLoc &DL = MI.getDebugLoc(); | |||
| 37326 | MachineFunction *MF = MBB->getParent(); | |||
| 37327 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); | |||
| 37328 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
| 37329 | MachineInstrBuilder MIB; | |||
| 37330 | ||||
| 37331 | // Memory Reference. | |||
| 37332 | SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(), | |||
| 37333 | MI.memoperands_end()); | |||
| 37334 | ||||
| 37335 | // Initialize a register with zero. | |||
| 37336 | MVT PVT = getPointerTy(MF->getDataLayout()); | |||
| 37337 | const TargetRegisterClass *PtrRC = getRegClassFor(PVT); | |||
| 37338 | Register ZReg = MRI.createVirtualRegister(PtrRC); | |||
| 37339 | unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr; | |||
| 37340 | BuildMI(*MBB, MI, DL, TII->get(XorRROpc)) | |||
| 37341 | .addDef(ZReg) | |||
| 37342 | .addReg(ZReg, RegState::Undef) | |||
| 37343 | .addReg(ZReg, RegState::Undef); | |||
| 37344 | ||||
| 37345 | // Read the current SSP Register value to the zeroed register. | |||
| 37346 | Register SSPCopyReg = MRI.createVirtualRegister(PtrRC); | |||
| 37347 | unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD; | |||
| 37348 | BuildMI(*MBB, MI, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); | |||
| 37349 | ||||
| 37350 | // Write the SSP register value to offset 3 in the input memory buffer. | |||
| 37351 | unsigned PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; | |||
| 37352 | MIB = BuildMI(*MBB, MI, DL, TII->get(PtrStoreOpc)); | |||
| 37353 | const int64_t SSPOffset = 3 * PVT.getStoreSize(); | |||
| 37354 | const unsigned MemOpndSlot = 1; | |||
| 37355 | for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { | |||
| 37356 | if (i == X86::AddrDisp) | |||
| 37357 | MIB.addDisp(MI.getOperand(MemOpndSlot + i), SSPOffset); | |||
| 37358 | else | |||
| 37359 | MIB.add(MI.getOperand(MemOpndSlot + i)); | |||
| 37360 | } | |||
| 37361 | MIB.addReg(SSPCopyReg); | |||
| 37362 | MIB.setMemRefs(MMOs); | |||
| 37363 | } | |||
| 37364 | ||||
| 37365 | MachineBasicBlock * | |||
| 37366 | X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI, | |||
| 37367 | MachineBasicBlock *MBB) const { | |||
| 37368 | const DebugLoc &DL = MI.getDebugLoc(); | |||
| 37369 | MachineFunction *MF = MBB->getParent(); | |||
| 37370 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); | |||
| 37371 | const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); | |||
| 37372 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
| 37373 | ||||
| 37374 | const BasicBlock *BB = MBB->getBasicBlock(); | |||
| 37375 | MachineFunction::iterator I = ++MBB->getIterator(); | |||
| 37376 | ||||
| 37377 | // Memory Reference | |||
| 37378 | SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(), | |||
| 37379 | MI.memoperands_end()); | |||
| 37380 | ||||
| 37381 | unsigned DstReg; | |||
| 37382 | unsigned MemOpndSlot = 0; | |||
| 37383 | ||||
| 37384 | unsigned CurOp = 0; | |||
| 37385 | ||||
| 37386 | DstReg = MI.getOperand(CurOp++).getReg(); | |||
| 37387 | const TargetRegisterClass *RC = MRI.getRegClass(DstReg); | |||
| 37388 | assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!")(static_cast <bool> (TRI->isTypeLegalForClass(*RC, MVT ::i32) && "Invalid destination!") ? void (0) : __assert_fail ("TRI->isTypeLegalForClass(*RC, MVT::i32) && \"Invalid destination!\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 37388, __extension__ __PRETTY_FUNCTION__)); | |||
| 37389 | (void)TRI; | |||
| 37390 | Register mainDstReg = MRI.createVirtualRegister(RC); | |||
| 37391 | Register restoreDstReg = MRI.createVirtualRegister(RC); | |||
| 37392 | ||||
| 37393 | MemOpndSlot = CurOp; | |||
| 37394 | ||||
| 37395 | MVT PVT = getPointerTy(MF->getDataLayout()); | |||
| 37396 | assert((PVT == MVT::i64 || PVT == MVT::i32) && | |||
| 37397 | "Invalid Pointer Size!"); | |||
| 37398 | ||||
| 37399 | // For v = setjmp(buf), we generate | |||
| 37400 | // | |||
| 37401 | // thisMBB: | |||
| 37402 | // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB | |||
| 37403 | // SjLjSetup restoreMBB | |||
| 37404 | // | |||
| 37405 | // mainMBB: | |||
| 37406 | // v_main = 0 | |||
| 37407 | // | |||
| 37408 | // sinkMBB: | |||
| 37409 | // v = phi(main, restore) | |||
| 37410 | // | |||
| 37411 | // restoreMBB: | |||
| 37412 | // if base pointer being used, load it from frame | |||
| 37413 | // v_restore = 1 | |||
| 37414 | ||||
| 37415 | MachineBasicBlock *thisMBB = MBB; | |||
| 37416 | MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB); | |||
| 37417 | MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); | |||
| 37418 | MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB); | |||
| 37419 | MF->insert(I, mainMBB); | |||
| 37420 | MF->insert(I, sinkMBB); | |||
| 37421 | MF->push_back(restoreMBB); | |||
| 37422 | restoreMBB->setMachineBlockAddressTaken(); | |||
| 37423 | ||||
| 37424 | MachineInstrBuilder MIB; | |||
| 37425 | ||||
| 37426 | // Transfer the remainder of BB and its successor edges to sinkMBB. | |||
| 37427 | sinkMBB->splice(sinkMBB->begin(), MBB, | |||
| 37428 | std::next(MachineBasicBlock::iterator(MI)), MBB->end()); | |||
| 37429 | sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); | |||
| 37430 | ||||
| 37431 | // thisMBB: | |||
| 37432 | unsigned PtrStoreOpc = 0; | |||
| 37433 | unsigned LabelReg = 0; | |||
| 37434 | const int64_t LabelOffset = 1 * PVT.getStoreSize(); | |||
| 37435 | bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && | |||
| 37436 | !isPositionIndependent(); | |||
| 37437 | ||||
| 37438 | // Prepare IP either in reg or imm. | |||
| 37439 | if (!UseImmLabel) { | |||
| 37440 | PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; | |||
| 37441 | const TargetRegisterClass *PtrRC = getRegClassFor(PVT); | |||
| 37442 | LabelReg = MRI.createVirtualRegister(PtrRC); | |||
| 37443 | if (Subtarget.is64Bit()) { | |||
| 37444 | MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg) | |||
| 37445 | .addReg(X86::RIP) | |||
| 37446 | .addImm(0) | |||
| 37447 | .addReg(0) | |||
| 37448 | .addMBB(restoreMBB) | |||
| 37449 | .addReg(0); | |||
| 37450 | } else { | |||
| 37451 | const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII); | |||
| 37452 | MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg) | |||
| 37453 | .addReg(XII->getGlobalBaseReg(MF)) | |||
| 37454 | .addImm(0) | |||
| 37455 | .addReg(0) | |||
| 37456 | .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference()) | |||
| 37457 | .addReg(0); | |||
| 37458 | } | |||
| 37459 | } else | |||
| 37460 | PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; | |||
| 37461 | // Store IP | |||
| 37462 | MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc)); | |||
| 37463 | for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { | |||
| 37464 | if (i == X86::AddrDisp) | |||
| 37465 | MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset); | |||
| 37466 | else | |||
| 37467 | MIB.add(MI.getOperand(MemOpndSlot + i)); | |||
| 37468 | } | |||
| 37469 | if (!UseImmLabel) | |||
| 37470 | MIB.addReg(LabelReg); | |||
| 37471 | else | |||
| 37472 | MIB.addMBB(restoreMBB); | |||
| 37473 | MIB.setMemRefs(MMOs); | |||
| 37474 | ||||
| 37475 | if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) { | |||
| 37476 | emitSetJmpShadowStackFix(MI, thisMBB); | |||
| 37477 | } | |||
| 37478 | ||||
| 37479 | // Setup | |||
| 37480 | MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup)) | |||
| 37481 | .addMBB(restoreMBB); | |||
| 37482 | ||||
| 37483 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); | |||
| 37484 | MIB.addRegMask(RegInfo->getNoPreservedMask()); | |||
| 37485 | thisMBB->addSuccessor(mainMBB); | |||
| 37486 | thisMBB->addSuccessor(restoreMBB); | |||
| 37487 | ||||
| 37488 | // mainMBB: | |||
| 37489 | // EAX = 0 | |||
| 37490 | BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg); | |||
| 37491 | mainMBB->addSuccessor(sinkMBB); | |||
| 37492 | ||||
| 37493 | // sinkMBB: | |||
| 37494 | BuildMI(*sinkMBB, sinkMBB->begin(), DL, | |||
| 37495 | TII->get(X86::PHI), DstReg) | |||
| 37496 | .addReg(mainDstReg).addMBB(mainMBB) | |||
| 37497 | .addReg(restoreDstReg).addMBB(restoreMBB); | |||
| 37498 | ||||
| 37499 | // restoreMBB: | |||
| 37500 | if (RegInfo->hasBasePointer(*MF)) { | |||
| 37501 | const bool Uses64BitFramePtr = | |||
| 37502 | Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); | |||
| 37503 | X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>(); | |||
| 37504 | X86FI->setRestoreBasePointer(MF); | |||
| 37505 | Register FramePtr = RegInfo->getFrameRegister(*MF); | |||
| 37506 | Register BasePtr = RegInfo->getBaseRegister(); | |||
| 37507 | unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm; | |||
| 37508 | addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr), | |||
| 37509 | FramePtr, true, X86FI->getRestoreBasePointerOffset()) | |||
| 37510 | .setMIFlag(MachineInstr::FrameSetup); | |||
| 37511 | } | |||
| 37512 | BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1); | |||
| 37513 | BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB); | |||
| 37514 | restoreMBB->addSuccessor(sinkMBB); | |||
| 37515 | ||||
| 37516 | MI.eraseFromParent(); | |||
| 37517 | return sinkMBB; | |||
| 37518 | } | |||
| 37519 | ||||
| 37520 | /// Fix the shadow stack using the previously saved SSP pointer. | |||
| 37521 | /// \sa emitSetJmpShadowStackFix | |||
| 37522 | /// \param [in] MI The temporary Machine Instruction for the builtin. | |||
| 37523 | /// \param [in] MBB The Machine Basic Block that will be modified. | |||
| 37524 | /// \return The sink MBB that will perform the future indirect branch. | |||
| 37525 | MachineBasicBlock * | |||
| 37526 | X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI, | |||
| 37527 | MachineBasicBlock *MBB) const { | |||
| 37528 | const DebugLoc &DL = MI.getDebugLoc(); | |||
| 37529 | MachineFunction *MF = MBB->getParent(); | |||
| 37530 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); | |||
| 37531 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
| 37532 | ||||
| 37533 | // Memory Reference | |||
| 37534 | SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(), | |||
| 37535 | MI.memoperands_end()); | |||
| 37536 | ||||
| 37537 | MVT PVT = getPointerTy(MF->getDataLayout()); | |||
| 37538 | const TargetRegisterClass *PtrRC = getRegClassFor(PVT); | |||
| 37539 | ||||
| 37540 | // checkSspMBB: | |||
| 37541 | // xor vreg1, vreg1 | |||
| 37542 | // rdssp vreg1 | |||
| 37543 | // test vreg1, vreg1 | |||
| 37544 | // je sinkMBB # Jump if Shadow Stack is not supported | |||
| 37545 | // fallMBB: | |||
| 37546 | // mov buf+24/12(%rip), vreg2 | |||
| 37547 | // sub vreg1, vreg2 | |||
| 37548 | // jbe sinkMBB # No need to fix the Shadow Stack | |||
| 37549 | // fixShadowMBB: | |||
| 37550 | // shr 3/2, vreg2 | |||
| 37551 | // incssp vreg2 # fix the SSP according to the lower 8 bits | |||
| 37552 | // shr 8, vreg2 | |||
| 37553 | // je sinkMBB | |||
| 37554 | // fixShadowLoopPrepareMBB: | |||
| 37555 | // shl vreg2 | |||
| 37556 | // mov 128, vreg3 | |||
| 37557 | // fixShadowLoopMBB: | |||
| 37558 | // incssp vreg3 | |||
| 37559 | // dec vreg2 | |||
| 37560 | // jne fixShadowLoopMBB # Iterate until you finish fixing | |||
| 37561 | // # the Shadow Stack | |||
| 37562 | // sinkMBB: | |||
| 37563 | ||||
| 37564 | MachineFunction::iterator I = ++MBB->getIterator(); | |||
| 37565 | const BasicBlock *BB = MBB->getBasicBlock(); | |||
| 37566 | ||||
| 37567 | MachineBasicBlock *checkSspMBB = MF->CreateMachineBasicBlock(BB); | |||
| 37568 | MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB); | |||
| 37569 | MachineBasicBlock *fixShadowMBB = MF->CreateMachineBasicBlock(BB); | |||
| 37570 | MachineBasicBlock *fixShadowLoopPrepareMBB = MF->CreateMachineBasicBlock(BB); | |||
| 37571 | MachineBasicBlock *fixShadowLoopMBB = MF->CreateMachineBasicBlock(BB); | |||
| 37572 | MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB); | |||
| 37573 | MF->insert(I, checkSspMBB); | |||
| 37574 | MF->insert(I, fallMBB); | |||
| 37575 | MF->insert(I, fixShadowMBB); | |||
| 37576 | MF->insert(I, fixShadowLoopPrepareMBB); | |||
| 37577 | MF->insert(I, fixShadowLoopMBB); | |||
| 37578 | MF->insert(I, sinkMBB); | |||
| 37579 | ||||
| 37580 | // Transfer the remainder of BB and its successor edges to sinkMBB. | |||
| 37581 | sinkMBB->splice(sinkMBB->begin(), MBB, MachineBasicBlock::iterator(MI), | |||
| 37582 | MBB->end()); | |||
| 37583 | sinkMBB->transferSuccessorsAndUpdatePHIs(MBB); | |||
| 37584 | ||||
| 37585 | MBB->addSuccessor(checkSspMBB); | |||
| 37586 | ||||
| 37587 | // Initialize a register with zero. | |||
| 37588 | Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass); | |||
| 37589 | BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg); | |||
| 37590 | ||||
| 37591 | if (PVT == MVT::i64) { | |||
| 37592 | Register TmpZReg = MRI.createVirtualRegister(PtrRC); | |||
| 37593 | BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg) | |||
| 37594 | .addImm(0) | |||
| 37595 | .addReg(ZReg) | |||
| 37596 | .addImm(X86::sub_32bit); | |||
| 37597 | ZReg = TmpZReg; | |||
| 37598 | } | |||
| 37599 | ||||
| 37600 | // Read the current SSP Register value to the zeroed register. | |||
| 37601 | Register SSPCopyReg = MRI.createVirtualRegister(PtrRC); | |||
| 37602 | unsigned RdsspOpc = (PVT == MVT::i64) ? X86::RDSSPQ : X86::RDSSPD; | |||
| 37603 | BuildMI(checkSspMBB, DL, TII->get(RdsspOpc), SSPCopyReg).addReg(ZReg); | |||
| 37604 | ||||
| 37605 | // Check whether the SSP register value is zero and, if so, jump directly | |||
| 37606 | // to the sink. | |||
| 37607 | unsigned TestRROpc = (PVT == MVT::i64) ? X86::TEST64rr : X86::TEST32rr; | |||
| 37608 | BuildMI(checkSspMBB, DL, TII->get(TestRROpc)) | |||
| 37609 | .addReg(SSPCopyReg) | |||
| 37610 | .addReg(SSPCopyReg); | |||
| 37611 | BuildMI(checkSspMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E); | |||
| 37612 | checkSspMBB->addSuccessor(sinkMBB); | |||
| 37613 | checkSspMBB->addSuccessor(fallMBB); | |||
| 37614 | ||||
| 37615 | // Reload the previously saved SSP register value. | |||
| 37616 | Register PrevSSPReg = MRI.createVirtualRegister(PtrRC); | |||
| 37617 | unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; | |||
| 37618 | const int64_t SSPOffset = 3 * PVT.getStoreSize(); | |||
| 37619 | MachineInstrBuilder MIB = | |||
| 37620 | BuildMI(fallMBB, DL, TII->get(PtrLoadOpc), PrevSSPReg); | |||
| 37621 | for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { | |||
| 37622 | const MachineOperand &MO = MI.getOperand(i); | |||
| 37623 | if (i == X86::AddrDisp) | |||
| 37624 | MIB.addDisp(MO, SSPOffset); | |||
| 37625 | else if (MO.isReg()) // Don't add the whole operand, we don't want to | |||
| 37626 | // preserve kill flags. | |||
| 37627 | MIB.addReg(MO.getReg()); | |||
| 37628 | else | |||
| 37629 | MIB.add(MO); | |||
| 37630 | } | |||
| 37631 | MIB.setMemRefs(MMOs); | |||
| 37632 | ||||
| 37633 | // Subtract the current SSP from the previous SSP. | |||
| 37634 | Register SspSubReg = MRI.createVirtualRegister(PtrRC); | |||
| 37635 | unsigned SubRROpc = (PVT == MVT::i64) ? X86::SUB64rr : X86::SUB32rr; | |||
| 37636 | BuildMI(fallMBB, DL, TII->get(SubRROpc), SspSubReg) | |||
| 37637 | .addReg(PrevSSPReg) | |||
| 37638 | .addReg(SSPCopyReg); | |||
| 37639 | ||||
| 37640 | // Jump to sink in case PrevSSPReg <= SSPCopyReg. | |||
| 37641 | BuildMI(fallMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_BE); | |||
| 37642 | fallMBB->addSuccessor(sinkMBB); | |||
| 37643 | fallMBB->addSuccessor(fixShadowMBB); | |||
| 37644 | ||||
| 37645 | // Shift right by 2 (32-bit) or 3 (64-bit) because incssp multiplies its argument by 4 or 8. | |||
| 37646 | unsigned ShrRIOpc = (PVT == MVT::i64) ? X86::SHR64ri : X86::SHR32ri; | |||
| 37647 | unsigned Offset = (PVT == MVT::i64) ? 3 : 2; | |||
| 37648 | Register SspFirstShrReg = MRI.createVirtualRegister(PtrRC); | |||
| 37649 | BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspFirstShrReg) | |||
| 37650 | .addReg(SspSubReg) | |||
| 37651 | .addImm(Offset); | |||
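| | // Worked example (64-bit): a delta of 0x140 bytes becomes 0x28 slots after | |||
| | // this shift; the incssp below consumes those low 8 bits in one step, and | |||
| | // the second shift then yields zero, so the fixup loop is skipped. | |||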
| 37652 | ||||
| 37653 | // Increase the SSP using only the lower 8 bits of the delta. | |||
| 37654 | unsigned IncsspOpc = (PVT == MVT::i64) ? X86::INCSSPQ : X86::INCSSPD; | |||
| 37655 | BuildMI(fixShadowMBB, DL, TII->get(IncsspOpc)).addReg(SspFirstShrReg); | |||
| 37656 | ||||
| 37657 | // Reset the lower 8 bits. | |||
| 37658 | Register SspSecondShrReg = MRI.createVirtualRegister(PtrRC); | |||
| 37659 | BuildMI(fixShadowMBB, DL, TII->get(ShrRIOpc), SspSecondShrReg) | |||
| 37660 | .addReg(SspFirstShrReg) | |||
| 37661 | .addImm(8); | |||
| 37662 | ||||
| 37663 | // Jump if the result of the shift is zero. | |||
| 37664 | BuildMI(fixShadowMBB, DL, TII->get(X86::JCC_1)).addMBB(sinkMBB).addImm(X86::COND_E); | |||
| 37665 | fixShadowMBB->addSuccessor(sinkMBB); | |||
| 37666 | fixShadowMBB->addSuccessor(fixShadowLoopPrepareMBB); | |||
| 37667 | ||||
| 37668 | // Do a single shift left. | |||
| 37669 | unsigned ShlR1Opc = (PVT == MVT::i64) ? X86::SHL64r1 : X86::SHL32r1; | |||
| 37670 | Register SspAfterShlReg = MRI.createVirtualRegister(PtrRC); | |||
| 37671 | BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(ShlR1Opc), SspAfterShlReg) | |||
| 37672 | .addReg(SspSecondShrReg); | |||
| 37673 | ||||
| 37674 | // Save the value 128 to a register (will be used next with incssp). | |||
| 37675 | Register Value128InReg = MRI.createVirtualRegister(PtrRC); | |||
| 37676 | unsigned MovRIOpc = (PVT == MVT::i64) ? X86::MOV64ri32 : X86::MOV32ri; | |||
| 37677 | BuildMI(fixShadowLoopPrepareMBB, DL, TII->get(MovRIOpc), Value128InReg) | |||
| 37678 | .addImm(128); | |||
| 37679 | fixShadowLoopPrepareMBB->addSuccessor(fixShadowLoopMBB); | |||
| 37680 | ||||
| 37681 | // Since incssp only looks at the lower 8 bits, we might need to do several | |||
| 37682 | // iterations of incssp until we finish fixing the shadow stack. | |||
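| | // Worked example (64-bit): for a delta of 0x3405 slots, 0x05 was fixed | |||
| | // above, the shr-by-8 left 0x34, and the shl doubled it to 0x68 loop | |||
| | // iterations; 0x68 * 128 == 0x3400 slots, exactly the remainder. | |||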
| 37683 | Register DecReg = MRI.createVirtualRegister(PtrRC); | |||
| 37684 | Register CounterReg = MRI.createVirtualRegister(PtrRC); | |||
| 37685 | BuildMI(fixShadowLoopMBB, DL, TII->get(X86::PHI), CounterReg) | |||
| 37686 | .addReg(SspAfterShlReg) | |||
| 37687 | .addMBB(fixShadowLoopPrepareMBB) | |||
| 37688 | .addReg(DecReg) | |||
| 37689 | .addMBB(fixShadowLoopMBB); | |||
| 37690 | ||||
| 37691 | // Every iteration we increase the SSP by 128. | |||
| 37692 | BuildMI(fixShadowLoopMBB, DL, TII->get(IncsspOpc)).addReg(Value128InReg); | |||
| 37693 | ||||
| 37694 | // Every iteration we decrement the counter by 1. | |||
| 37695 | unsigned DecROpc = (PVT == MVT::i64) ? X86::DEC64r : X86::DEC32r; | |||
| 37696 | BuildMI(fixShadowLoopMBB, DL, TII->get(DecROpc), DecReg).addReg(CounterReg); | |||
| 37697 | ||||
| 37698 | // Jump if the counter is not zero yet. | |||
| 37699 | BuildMI(fixShadowLoopMBB, DL, TII->get(X86::JCC_1)).addMBB(fixShadowLoopMBB).addImm(X86::COND_NE); | |||
| 37700 | fixShadowLoopMBB->addSuccessor(sinkMBB); | |||
| 37701 | fixShadowLoopMBB->addSuccessor(fixShadowLoopMBB); | |||
| 37702 | ||||
| 37703 | return sinkMBB; | |||
| 37704 | } | |||
| 37705 | ||||
| 37706 | MachineBasicBlock * | |||
| 37707 | X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI, | |||
| 37708 | MachineBasicBlock *MBB) const { | |||
| 37709 | const DebugLoc &DL = MI.getDebugLoc(); | |||
| 37710 | MachineFunction *MF = MBB->getParent(); | |||
| 37711 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); | |||
| 37712 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
| 37713 | ||||
| 37714 | // Memory Reference | |||
| 37715 | SmallVector<MachineMemOperand *, 2> MMOs(MI.memoperands_begin(), | |||
| 37716 | MI.memoperands_end()); | |||
| 37717 | ||||
| 37718 | MVT PVT = getPointerTy(MF->getDataLayout()); | |||
| 37719 | assert((PVT == MVT::i64 || PVT == MVT::i32) && | |||
| 37720 | "Invalid Pointer Size!"); | |||
| 37721 | ||||
| 37722 | const TargetRegisterClass *RC = | |||
| 37723 | (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; | |||
| 37724 | Register Tmp = MRI.createVirtualRegister(RC); | |||
| 37725 | // Since FP is only updated here but NOT referenced, it's treated as a GPR. | |||
| 37726 | const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); | |||
| 37727 | Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP; | |||
| 37728 | Register SP = RegInfo->getStackRegister(); | |||
| 37729 | ||||
| 37730 | MachineInstrBuilder MIB; | |||
| 37731 | ||||
| 37732 | const int64_t LabelOffset = 1 * PVT.getStoreSize(); | |||
| 37733 | const int64_t SPOffset = 2 * PVT.getStoreSize(); | |||
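| | // Jump-buffer layout assumed by this lowering, in pointer-size slots: | |||
| | //   buf[0] = frame pointer, buf[1] = resume IP, buf[2] = stack pointer, | |||
| | //   buf[3] = saved SSP (only written under cf-protection-return). | |||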
| 37734 | ||||
| 37735 | unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm; | |||
| 37736 | unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r; | |||
| 37737 | ||||
| 37738 | MachineBasicBlock *thisMBB = MBB; | |||
| 37739 | ||||
| 37740 | // When CET and the shadow stack are enabled, we need to fix the Shadow Stack. | |||
| 37741 | if (MF->getMMI().getModule()->getModuleFlag("cf-protection-return")) { | |||
| 37742 | thisMBB = emitLongJmpShadowStackFix(MI, thisMBB); | |||
| 37743 | } | |||
| 37744 | ||||
| 37745 | // Reload FP | |||
| 37746 | MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), FP); | |||
| 37747 | for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { | |||
| 37748 | const MachineOperand &MO = MI.getOperand(i); | |||
| 37749 | if (MO.isReg()) // Don't add the whole operand, we don't want to | |||
| 37750 | // preserve kill flags. | |||
| 37751 | MIB.addReg(MO.getReg()); | |||
| 37752 | else | |||
| 37753 | MIB.add(MO); | |||
| 37754 | } | |||
| 37755 | MIB.setMemRefs(MMOs); | |||
| 37756 | ||||
| 37757 | // Reload IP | |||
| 37758 | MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), Tmp); | |||
| 37759 | for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { | |||
| 37760 | const MachineOperand &MO = MI.getOperand(i); | |||
| 37761 | if (i == X86::AddrDisp) | |||
| 37762 | MIB.addDisp(MO, LabelOffset); | |||
| 37763 | else if (MO.isReg()) // Don't add the whole operand, we don't want to | |||
| 37764 | // preserve kill flags. | |||
| 37765 | MIB.addReg(MO.getReg()); | |||
| 37766 | else | |||
| 37767 | MIB.add(MO); | |||
| 37768 | } | |||
| 37769 | MIB.setMemRefs(MMOs); | |||
| 37770 | ||||
| 37771 | // Reload SP | |||
| 37772 | MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrLoadOpc), SP); | |||
| 37773 | for (unsigned i = 0; i < X86::AddrNumOperands; ++i) { | |||
| 37774 | if (i == X86::AddrDisp) | |||
| 37775 | MIB.addDisp(MI.getOperand(i), SPOffset); | |||
| 37776 | else | |||
| 37777 | MIB.add(MI.getOperand(i)); // We can preserve the kill flags here, it's | |||
| 37778 | // the last instruction of the expansion. | |||
| 37779 | } | |||
| 37780 | MIB.setMemRefs(MMOs); | |||
| 37781 | ||||
| 37782 | // Jump | |||
| 37783 | BuildMI(*thisMBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp); | |||
| 37784 | ||||
| 37785 | MI.eraseFromParent(); | |||
| 37786 | return thisMBB; | |||
| 37787 | } | |||
| 37788 | ||||
| 37789 | void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, | |||
| 37790 | MachineBasicBlock *MBB, | |||
| 37791 | MachineBasicBlock *DispatchBB, | |||
| 37792 | int FI) const { | |||
| 37793 | const DebugLoc &DL = MI.getDebugLoc(); | |||
| 37794 | MachineFunction *MF = MBB->getParent(); | |||
| 37795 | MachineRegisterInfo *MRI = &MF->getRegInfo(); | |||
| 37796 | const X86InstrInfo *TII = Subtarget.getInstrInfo(); | |||
| 37797 | ||||
| 37798 | MVT PVT = getPointerTy(MF->getDataLayout()); | |||
| 37799 | assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!"); | |||
| 37800 | ||||
| 37801 | unsigned Op = 0; | |||
| 37802 | unsigned VR = 0; | |||
| 37803 | ||||
| 37804 | bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) && | |||
| 37805 | !isPositionIndependent(); | |||
| 37806 | ||||
| 37807 | if (UseImmLabel) { | |||
| 37808 | Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi; | |||
| 37809 | } else { | |||
| 37810 | const TargetRegisterClass *TRC = | |||
| 37811 | (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass; | |||
| 37812 | VR = MRI->createVirtualRegister(TRC); | |||
| 37813 | Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr; | |||
| 37814 | ||||
| 37815 | if (Subtarget.is64Bit()) | |||
| 37816 | BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR) | |||
| 37817 | .addReg(X86::RIP) | |||
| 37818 | .addImm(1) | |||
| 37819 | .addReg(0) | |||
| 37820 | .addMBB(DispatchBB) | |||
| 37821 | .addReg(0); | |||
| 37822 | else | |||
| 37823 | BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR) | |||
| 37824 | .addReg(0) /* TII->getGlobalBaseReg(MF) */ | |||
| 37825 | .addImm(1) | |||
| 37826 | .addReg(0) | |||
| 37827 | .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference()) | |||
| 37828 | .addReg(0); | |||
| 37829 | } | |||
| 37830 | ||||
| 37831 | MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op)); | |||
| 37832 | addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36); | |||
| 37833 | if (UseImmLabel) | |||
| 37834 | MIB.addMBB(DispatchBB); | |||
| 37835 | else | |||
| 37836 | MIB.addReg(VR); | |||
| 37837 | } | |||
| 37838 | ||||
| 37839 | MachineBasicBlock * | |||
| 37840 | X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, | |||
| 37841 | MachineBasicBlock *BB) const { | |||
| 37842 | const DebugLoc &DL = MI.getDebugLoc(); | |||
| 37843 | MachineFunction *MF = BB->getParent(); | |||
| 37844 | MachineRegisterInfo *MRI = &MF->getRegInfo(); | |||
| 37845 | const X86InstrInfo *TII = Subtarget.getInstrInfo(); | |||
| 37846 | int FI = MF->getFrameInfo().getFunctionContextIndex(); | |||
| 37847 | ||||
| 37848 | // Get a mapping of the call site numbers to all of the landing pads they're | |||
| 37849 | // associated with. | |||
| 37850 | DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad; | |||
| 37851 | unsigned MaxCSNum = 0; | |||
| 37852 | for (auto &MBB : *MF) { | |||
| 37853 | if (!MBB.isEHPad()) | |||
| 37854 | continue; | |||
| 37855 | ||||
| 37856 | MCSymbol *Sym = nullptr; | |||
| 37857 | for (const auto &MI : MBB) { | |||
| 37858 | if (MI.isDebugInstr()) | |||
| 37859 | continue; | |||
| 37860 | ||||
| 37861 | assert(MI.isEHLabel() && "expected EH_LABEL"); | |||
| 37862 | Sym = MI.getOperand(0).getMCSymbol(); | |||
| 37863 | break; | |||
| 37864 | } | |||
| 37865 | ||||
| 37866 | if (!MF->hasCallSiteLandingPad(Sym)) | |||
| 37867 | continue; | |||
| 37868 | ||||
| 37869 | for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) { | |||
| 37870 | CallSiteNumToLPad[CSI].push_back(&MBB); | |||
| 37871 | MaxCSNum = std::max(MaxCSNum, CSI); | |||
| 37872 | } | |||
| 37873 | } | |||
| 37874 | ||||
| 37875 | // Get an ordered list of the machine basic blocks for the jump table. | |||
| 37876 | std::vector<MachineBasicBlock *> LPadList; | |||
| 37877 | SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs; | |||
| 37878 | LPadList.reserve(CallSiteNumToLPad.size()); | |||
| 37879 | ||||
| 37880 | for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) { | |||
| 37881 | for (auto &LP : CallSiteNumToLPad[CSI]) { | |||
| 37882 | LPadList.push_back(LP); | |||
| 37883 | InvokeBBs.insert(LP->pred_begin(), LP->pred_end()); | |||
| 37884 | } | |||
| 37885 | } | |||
| 37886 | ||||
| 37887 | assert(!LPadList.empty() && | |||
| 37888 | "No landing pad destinations for the dispatch jump table!"); | |||
| 37889 | ||||
| 37890 | // Create the MBBs for the dispatch code. | |||
| 37891 | ||||
| 37892 | // Shove the dispatch's address into the return slot in the function context. | |||
| 37893 | MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock(); | |||
| 37894 | DispatchBB->setIsEHPad(true); | |||
| 37895 | ||||
| 37896 | MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock(); | |||
| 37897 | BuildMI(TrapBB, DL, TII->get(X86::TRAP)); | |||
| 37898 | DispatchBB->addSuccessor(TrapBB); | |||
| 37899 | ||||
| 37900 | MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock(); | |||
| 37901 | DispatchBB->addSuccessor(DispContBB); | |||
| 37902 | ||||
| 37903 | // Insert MBBs. | |||
| 37904 | MF->push_back(DispatchBB); | |||
| 37905 | MF->push_back(DispContBB); | |||
| 37906 | MF->push_back(TrapBB); | |||
| 37907 | ||||
| 37908 | // Insert code into the entry block that creates and registers the function | |||
| 37909 | // context. | |||
| 37910 | SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI); | |||
| 37911 | ||||
| 37912 | // Create the jump table and associated information | |||
| 37913 | unsigned JTE = getJumpTableEncoding(); | |||
| 37914 | MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE); | |||
| 37915 | unsigned MJTI = JTI->createJumpTableIndex(LPadList); | |||
| 37916 | ||||
| 37917 | const X86RegisterInfo &RI = TII->getRegisterInfo(); | |||
| 37918 | // Add a register mask with no preserved registers. This results in all | |||
| 37919 | // registers being marked as clobbered. | |||
| 37920 | if (RI.hasBasePointer(*MF)) { | |||
| 37921 | const bool FPIs64Bit = | |||
| 37922 | Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64(); | |||
| 37923 | X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>(); | |||
| 37924 | MFI->setRestoreBasePointer(MF); | |||
| 37925 | ||||
| 37926 | Register FP = RI.getFrameRegister(*MF); | |||
| 37927 | Register BP = RI.getBaseRegister(); | |||
| 37928 | unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm; | |||
| 37929 | addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true, | |||
| 37930 | MFI->getRestoreBasePointerOffset()) | |||
| 37931 | .addRegMask(RI.getNoPreservedMask()); | |||
| 37932 | } else { | |||
| 37933 | BuildMI(DispatchBB, DL, TII->get(X86::NOOP)) | |||
| 37934 | .addRegMask(RI.getNoPreservedMask()); | |||
| 37935 | } | |||
| 37936 | ||||
| 37937 | // IReg is used as an index in a memory operand and therefore can't be SP | |||
| 37938 | Register IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass); | |||
| 37939 | addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI, | |||
| 37940 | Subtarget.is64Bit() ? 8 : 4); | |||
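| | // This loads the call-site number recorded in the function context; it is | |||
| | // used below to index the dispatch jump table. | |||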
| 37941 | BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri)) | |||
| 37942 | .addReg(IReg) | |||
| 37943 | .addImm(LPadList.size()); | |||
| 37944 | BuildMI(DispatchBB, DL, TII->get(X86::JCC_1)).addMBB(TrapBB).addImm(X86::COND_AE); | |||
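| | // A single unsigned compare plus JAE routes any index >= LPadList.size(), | |||
| | // i.e. anything outside the table's range, to the trap block. | |||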
| 37945 | ||||
| 37946 | if (Subtarget.is64Bit()) { | |||
| 37947 | Register BReg = MRI->createVirtualRegister(&X86::GR64RegClass); | |||
| 37948 | Register IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); | |||
| 37949 | ||||
| 37950 | // leaq .LJTI0_0(%rip), BReg | |||
| 37951 | BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg) | |||
| 37952 | .addReg(X86::RIP) | |||
| 37953 | .addImm(1) | |||
| 37954 | .addReg(0) | |||
| 37955 | .addJumpTableIndex(MJTI) | |||
| 37956 | .addReg(0); | |||
| 37957 | // movzx IReg64, IReg | |||
| 37958 | BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64) | |||
| 37959 | .addImm(0) | |||
| 37960 | .addReg(IReg) | |||
| 37961 | .addImm(X86::sub_32bit); | |||
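| | // SUBREG_TO_REG with a 0 immediate asserts that the upper 32 bits are | |||
| | // already zero, so this zero-extends IReg to 64 bits for free. | |||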
| 37962 | ||||
| 37963 | switch (JTE) { | |||
| 37964 | case MachineJumpTableInfo::EK_BlockAddress: | |||
| 37965 | // jmpq *(BReg,IReg64,8) | |||
| 37966 | BuildMI(DispContBB, DL, TII->get(X86::JMP64m)) | |||
| 37967 | .addReg(BReg) | |||
| 37968 | .addImm(8) | |||
| 37969 | .addReg(IReg64) | |||
| 37970 | .addImm(0) | |||
| 37971 | .addReg(0); | |||
| 37972 | break; | |||
| 37973 | case MachineJumpTableInfo::EK_LabelDifference32: { | |||
| 37974 | Register OReg = MRI->createVirtualRegister(&X86::GR32RegClass); | |||
| 37975 | Register OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass); | |||
| 37976 | Register TReg = MRI->createVirtualRegister(&X86::GR64RegClass); | |||
| 37977 | ||||
| 37978 | // movl (BReg,IReg64,4), OReg | |||
| 37979 | BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg) | |||
| 37980 | .addReg(BReg) | |||
| 37981 | .addImm(4) | |||
| 37982 | .addReg(IReg64) | |||
| 37983 | .addImm(0) | |||
| 37984 | .addReg(0); | |||
| 37985 | // movsx OReg64, OReg | |||
| 37986 | BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg); | |||
| 37987 | // addq BReg, OReg64, TReg | |||
| 37988 | BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg) | |||
| 37989 | .addReg(OReg64) | |||
| 37990 | .addReg(BReg); | |||
| 37991 | // jmpq *TReg | |||
| 37992 | BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg); | |||
| 37993 | break; | |||
| 37994 | } | |||
| 37995 | default: | |||
| 37996 | llvm_unreachable("Unexpected jump table encoding")::llvm::llvm_unreachable_internal("Unexpected jump table encoding" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 37996); | |||
| 37997 | } | |||
| 37998 | } else { | |||
| 37999 | // jmpl *.LJTI0_0(,IReg,4) | |||
| 38000 | BuildMI(DispContBB, DL, TII->get(X86::JMP32m)) | |||
| 38001 | .addReg(0) | |||
| 38002 | .addImm(4) | |||
| 38003 | .addReg(IReg) | |||
| 38004 | .addJumpTableIndex(MJTI) | |||
| 38005 | .addReg(0); | |||
| 38006 | } | |||
| 38007 | ||||
| 38008 | // Add the jump table entries as successors to the MBB. | |||
| 38009 | SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs; | |||
| 38010 | for (auto &LP : LPadList) | |||
| 38011 | if (SeenMBBs.insert(LP).second) | |||
| 38012 | DispContBB->addSuccessor(LP); | |||
| 38013 | ||||
| 38014 | // N.B. the order the invoke BBs are processed in doesn't matter here. | |||
| 38015 | SmallVector<MachineBasicBlock *, 64> MBBLPads; | |||
| 38016 | const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs(); | |||
| 38017 | for (MachineBasicBlock *MBB : InvokeBBs) { | |||
| 38018 | // Remove the landing pad successor from the invoke block and replace it | |||
| 38019 | // with the new dispatch block. | |||
| 38020 | // Keep a copy of Successors since it's modified inside the loop. | |||
| 38021 | SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(), | |||
| 38022 | MBB->succ_rend()); | |||
| 38023 | // FIXME: Avoid quadratic complexity. | |||
| 38024 | for (auto *MBBS : Successors) { | |||
| 38025 | if (MBBS->isEHPad()) { | |||
| 38026 | MBB->removeSuccessor(MBBS); | |||
| 38027 | MBBLPads.push_back(MBBS); | |||
| 38028 | } | |||
| 38029 | } | |||
| 38030 | ||||
| 38031 | MBB->addSuccessor(DispatchBB); | |||
| 38032 | ||||
| 38033 | // Find the invoke call and mark all of the callee-saved registers as | |||
| 38034 | // 'implicit defined' so that they're spilled. This prevents later passes | |||
| 38035 | // from moving instructions to before the EH block, where they would never | |||
| 38036 | // be executed. | |||
| 38037 | for (auto &II : reverse(*MBB)) { | |||
| 38038 | if (!II.isCall()) | |||
| 38039 | continue; | |||
| 38040 | ||||
| 38041 | DenseMap<unsigned, bool> DefRegs; | |||
| 38042 | for (auto &MOp : II.operands()) | |||
| 38043 | if (MOp.isReg()) | |||
| 38044 | DefRegs[MOp.getReg()] = true; | |||
| 38045 | ||||
| 38046 | MachineInstrBuilder MIB(*MF, &II); | |||
| 38047 | for (unsigned RegIdx = 0; SavedRegs[RegIdx]; ++RegIdx) { | |||
| 38048 | unsigned Reg = SavedRegs[RegIdx]; | |||
| 38049 | if (!DefRegs[Reg]) | |||
| 38050 | MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead); | |||
| 38051 | } | |||
| 38052 | ||||
| 38053 | break; | |||
| 38054 | } | |||
| 38055 | } | |||
| 38056 | ||||
| 38057 | // Mark all former landing pads as non-landing pads. The dispatch is the only | |||
| 38058 | // landing pad now. | |||
| 38059 | for (auto &LP : MBBLPads) | |||
| 38060 | LP->setIsEHPad(false); | |||
| 38061 | ||||
| 38062 | // The instruction is gone now. | |||
| 38063 | MI.eraseFromParent(); | |||
| 38064 | return BB; | |||
| 38065 | } | |||
| 38066 | ||||
| 38067 | MachineBasicBlock * | |||
| 38068 | X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, | |||
| 38069 | MachineBasicBlock *BB) const { | |||
| 38070 | MachineFunction *MF = BB->getParent(); | |||
| 38071 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); | |||
| 38072 | const DebugLoc &DL = MI.getDebugLoc(); | |||
| 38073 | ||||
| 38074 | auto TMMImmToTMMReg = [](unsigned Imm) { | |||
| 38075 | assert(Imm < 8 && "Illegal tmm index"); | |||
| 38076 | return X86::TMM0 + Imm; | |||
| 38077 | }; | |||
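| | // AMX pseudos encode the tile register as an immediate; TMM0..TMM7 are | |||
| | // consecutive in the register enumeration, so simple addition suffices. | |||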
| 38078 | switch (MI.getOpcode()) { | |||
| 38079 | default: llvm_unreachable("Unexpected instr type to insert"); | |||
| 38080 | case X86::TLS_addr32: | |||
| 38081 | case X86::TLS_addr64: | |||
| 38082 | case X86::TLS_addrX32: | |||
| 38083 | case X86::TLS_base_addr32: | |||
| 38084 | case X86::TLS_base_addr64: | |||
| 38085 | case X86::TLS_base_addrX32: | |||
| 38086 | return EmitLoweredTLSAddr(MI, BB); | |||
| 38087 | case X86::INDIRECT_THUNK_CALL32: | |||
| 38088 | case X86::INDIRECT_THUNK_CALL64: | |||
| 38089 | case X86::INDIRECT_THUNK_TCRETURN32: | |||
| 38090 | case X86::INDIRECT_THUNK_TCRETURN64: | |||
| 38091 | return EmitLoweredIndirectThunk(MI, BB); | |||
| 38092 | case X86::CATCHRET: | |||
| 38093 | return EmitLoweredCatchRet(MI, BB); | |||
| 38094 | case X86::SEG_ALLOCA_32: | |||
| 38095 | case X86::SEG_ALLOCA_64: | |||
| 38096 | return EmitLoweredSegAlloca(MI, BB); | |||
| 38097 | case X86::PROBED_ALLOCA_32: | |||
| 38098 | case X86::PROBED_ALLOCA_64: | |||
| 38099 | return EmitLoweredProbedAlloca(MI, BB); | |||
| 38100 | case X86::TLSCall_32: | |||
| 38101 | case X86::TLSCall_64: | |||
| 38102 | return EmitLoweredTLSCall(MI, BB); | |||
| 38103 | case X86::CMOV_FR16: | |||
| 38104 | case X86::CMOV_FR16X: | |||
| 38105 | case X86::CMOV_FR32: | |||
| 38106 | case X86::CMOV_FR32X: | |||
| 38107 | case X86::CMOV_FR64: | |||
| 38108 | case X86::CMOV_FR64X: | |||
| 38109 | case X86::CMOV_GR8: | |||
| 38110 | case X86::CMOV_GR16: | |||
| 38111 | case X86::CMOV_GR32: | |||
| 38112 | case X86::CMOV_RFP32: | |||
| 38113 | case X86::CMOV_RFP64: | |||
| 38114 | case X86::CMOV_RFP80: | |||
| 38115 | case X86::CMOV_VR64: | |||
| 38116 | case X86::CMOV_VR128: | |||
| 38117 | case X86::CMOV_VR128X: | |||
| 38118 | case X86::CMOV_VR256: | |||
| 38119 | case X86::CMOV_VR256X: | |||
| 38120 | case X86::CMOV_VR512: | |||
| 38121 | case X86::CMOV_VK1: | |||
| 38122 | case X86::CMOV_VK2: | |||
| 38123 | case X86::CMOV_VK4: | |||
| 38124 | case X86::CMOV_VK8: | |||
| 38125 | case X86::CMOV_VK16: | |||
| 38126 | case X86::CMOV_VK32: | |||
| 38127 | case X86::CMOV_VK64: | |||
| 38128 | return EmitLoweredSelect(MI, BB); | |||
| 38129 | ||||
| 38130 | case X86::FP80_ADDr: | |||
| 38131 | case X86::FP80_ADDm32: { | |||
| 38132 | // Change the floating point control register to use double extended | |||
| 38133 | // precision when performing the addition. | |||
| 38134 | int OrigCWFrameIdx = | |||
| 38135 | MF->getFrameInfo().CreateStackObject(2, Align(2), false); | |||
| 38136 | addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FNSTCW16m)), | |||
| 38137 | OrigCWFrameIdx); | |||
| 38138 | ||||
| 38139 | // Load the old value of the control word... | |||
| 38140 | Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); | |||
| 38141 | addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW), | |||
| 38142 | OrigCWFrameIdx); | |||
| 38143 | ||||
| 38144 | // OR 0b11 into bits 8 and 9. 0b11 is the encoding for double extended | |||
| 38145 | // precision. | |||
| 38146 | Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); | |||
| 38147 | BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW) | |||
| 38148 | .addReg(OldCW, RegState::Kill) | |||
| 38149 | .addImm(0x300); | |||
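| | // 0x300 is 0b11 << 8, i.e. the PC (precision control) field of the x87 | |||
| | // control word set to 0b11, selecting 80-bit double extended precision. | |||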
| 38150 | ||||
| 38151 | // Extract to 16 bits. | |||
| 38152 | Register NewCW16 = | |||
| 38153 | MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); | |||
| 38154 | BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16) | |||
| 38155 | .addReg(NewCW, RegState::Kill, X86::sub_16bit); | |||
| 38156 | ||||
| 38157 | // Prepare memory for FLDCW. | |||
| 38158 | int NewCWFrameIdx = | |||
| 38159 | MF->getFrameInfo().CreateStackObject(2, Align(2), false); | |||
| 38160 | addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), | |||
| 38161 | NewCWFrameIdx) | |||
| 38162 | .addReg(NewCW16, RegState::Kill); | |||
| 38163 | ||||
| 38164 | // Reload the modified control word now... | |||
| 38165 | addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), | |||
| 38166 | NewCWFrameIdx); | |||
| 38167 | ||||
| 38168 | // Do the addition. | |||
| 38169 | if (MI.getOpcode() == X86::FP80_ADDr) { | |||
| 38170 | BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80)) | |||
| 38171 | .add(MI.getOperand(0)) | |||
| 38172 | .add(MI.getOperand(1)) | |||
| 38173 | .add(MI.getOperand(2)); | |||
| 38174 | } else { | |||
| 38175 | BuildMI(*BB, MI, DL, TII->get(X86::ADD_Fp80m32)) | |||
| 38176 | .add(MI.getOperand(0)) | |||
| 38177 | .add(MI.getOperand(1)) | |||
| 38178 | .add(MI.getOperand(2)) | |||
| 38179 | .add(MI.getOperand(3)) | |||
| 38180 | .add(MI.getOperand(4)) | |||
| 38181 | .add(MI.getOperand(5)) | |||
| 38182 | .add(MI.getOperand(6)); | |||
| 38183 | } | |||
| 38184 | ||||
| 38185 | // Reload the original control word now. | |||
| 38186 | addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::FLDCW16m)), | |||
| 38187 | OrigCWFrameIdx); | |||
| 38188 | ||||
| 38189 | MI.eraseFromParent(); // The pseudo instruction is gone now. | |||
| 38190 | return BB; | |||
| 38191 | } | |||
| 38192 | ||||
| 38193 | case X86::FP32_TO_INT16_IN_MEM: | |||
| 38194 | case X86::FP32_TO_INT32_IN_MEM: | |||
| 38195 | case X86::FP32_TO_INT64_IN_MEM: | |||
| 38196 | case X86::FP64_TO_INT16_IN_MEM: | |||
| 38197 | case X86::FP64_TO_INT32_IN_MEM: | |||
| 38198 | case X86::FP64_TO_INT64_IN_MEM: | |||
| 38199 | case X86::FP80_TO_INT16_IN_MEM: | |||
| 38200 | case X86::FP80_TO_INT32_IN_MEM: | |||
| 38201 | case X86::FP80_TO_INT64_IN_MEM: { | |||
| 38202 | // Change the floating point control register to use "round towards zero" | |||
| 38203 | // mode when truncating to an integer value. | |||
| 38204 | int OrigCWFrameIdx = | |||
| 38205 | MF->getFrameInfo().CreateStackObject(2, Align(2), false); | |||
| 38206 | addFrameReference(BuildMI(*BB, MI, DL, | |||
| 38207 | TII->get(X86::FNSTCW16m)), OrigCWFrameIdx); | |||
| 38208 | ||||
| 38209 | // Load the old value of the control word... | |||
| 38210 | Register OldCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); | |||
| 38211 | addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOVZX32rm16), OldCW), | |||
| 38212 | OrigCWFrameIdx); | |||
| 38213 | ||||
| 38214 | // OR 0b11 into bits 10 and 11. 0b11 is the encoding for round toward zero. | |||
| 38215 | Register NewCW = MF->getRegInfo().createVirtualRegister(&X86::GR32RegClass); | |||
| 38216 | BuildMI(*BB, MI, DL, TII->get(X86::OR32ri), NewCW) | |||
| 38217 | .addReg(OldCW, RegState::Kill).addImm(0xC00); | |||
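| | // 0xC00 is 0b11 << 10, i.e. the RC (rounding control) field of the x87 | |||
| | // control word set to 0b11: round toward zero, as C-style casts require. | |||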
| 38218 | ||||
| 38219 | // Extract to 16 bits. | |||
| 38220 | Register NewCW16 = | |||
| 38221 | MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass); | |||
| 38222 | BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), NewCW16) | |||
| 38223 | .addReg(NewCW, RegState::Kill, X86::sub_16bit); | |||
| 38224 | ||||
| 38225 | // Prepare memory for FLDCW. | |||
| 38226 | int NewCWFrameIdx = | |||
| 38227 | MF->getFrameInfo().CreateStackObject(2, Align(2), false); | |||
| 38228 | addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), | |||
| 38229 | NewCWFrameIdx) | |||
| 38230 | .addReg(NewCW16, RegState::Kill); | |||
| 38231 | ||||
| 38232 | // Reload the modified control word now... | |||
| 38233 | addFrameReference(BuildMI(*BB, MI, DL, | |||
| 38234 | TII->get(X86::FLDCW16m)), NewCWFrameIdx); | |||
| 38235 | ||||
| 38236 | // Get the X86 opcode to use. | |||
| 38237 | unsigned Opc; | |||
| 38238 | switch (MI.getOpcode()) { | |||
| 38239 | default: llvm_unreachable("illegal opcode!")::llvm::llvm_unreachable_internal("illegal opcode!", "llvm/lib/Target/X86/X86ISelLowering.cpp" , 38239); | |||
| 38240 | case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break; | |||
| 38241 | case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break; | |||
| 38242 | case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break; | |||
| 38243 | case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break; | |||
| 38244 | case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break; | |||
| 38245 | case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break; | |||
| 38246 | case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break; | |||
| 38247 | case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break; | |||
| 38248 | case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break; | |||
| 38249 | } | |||
| 38250 | ||||
| 38251 | X86AddressMode AM = getAddressFromInstr(&MI, 0); | |||
| 38252 | addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM) | |||
| 38253 | .addReg(MI.getOperand(X86::AddrNumOperands).getReg()); | |||
| 38254 | ||||
| 38255 | // Reload the original control word now. | |||
| 38256 | addFrameReference(BuildMI(*BB, MI, DL, | |||
| 38257 | TII->get(X86::FLDCW16m)), OrigCWFrameIdx); | |||
| 38258 | ||||
| 38259 | MI.eraseFromParent(); // The pseudo instruction is gone now. | |||
| 38260 | return BB; | |||
| 38261 | } | |||
| 38262 | ||||
| 38263 | // xbegin | |||
| 38264 | case X86::XBEGIN: | |||
| 38265 | return emitXBegin(MI, BB, Subtarget.getInstrInfo()); | |||
| 38266 | ||||
| 38267 | case X86::VAARG_64: | |||
| 38268 | case X86::VAARG_X32: | |||
| 38269 | return EmitVAARGWithCustomInserter(MI, BB); | |||
| 38270 | ||||
| 38271 | case X86::EH_SjLj_SetJmp32: | |||
| 38272 | case X86::EH_SjLj_SetJmp64: | |||
| 38273 | return emitEHSjLjSetJmp(MI, BB); | |||
| 38274 | ||||
| 38275 | case X86::EH_SjLj_LongJmp32: | |||
| 38276 | case X86::EH_SjLj_LongJmp64: | |||
| 38277 | return emitEHSjLjLongJmp(MI, BB); | |||
| 38278 | ||||
| 38279 | case X86::Int_eh_sjlj_setup_dispatch: | |||
| 38280 | return EmitSjLjDispatchBlock(MI, BB); | |||
| 38281 | ||||
| 38282 | case TargetOpcode::STATEPOINT: | |||
| 38283 | // As an implementation detail, STATEPOINT shares the STACKMAP format at | |||
| 38284 | // this point in the process. We diverge later. | |||
| 38285 | return emitPatchPoint(MI, BB); | |||
| 38286 | ||||
| 38287 | case TargetOpcode::STACKMAP: | |||
| 38288 | case TargetOpcode::PATCHPOINT: | |||
| 38289 | return emitPatchPoint(MI, BB); | |||
| 38290 | ||||
| 38291 | case TargetOpcode::PATCHABLE_EVENT_CALL: | |||
| 38292 | case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL: | |||
| 38293 | return BB; | |||
| 38294 | ||||
| 38295 | case X86::LCMPXCHG8B: { | |||
| 38296 | const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); | |||
| 38297 | // In addition to the four E[ABCD] registers implied by its encoding, | |||
| 38298 | // CMPXCHG8B requires a memory operand. If the current architecture is | |||
| 38299 | // i686 and the current function needs a base pointer | |||
| 38300 | // - which is ESI on i686 - the register allocator would not be able to | |||
| 38301 | // allocate registers for an address of the form X(%reg, %reg, Y): | |||
| 38302 | // there would never be enough unreserved registers during regalloc | |||
| 38303 | // (without the base pointer the only option would be X(%edi, %esi, Y)). | |||
| 38304 | // We give the register allocator a hand by precomputing the address in | |||
| 38305 | // a new vreg using LEA. | |||
| 38306 | ||||
| 38307 | // If this is not i686 or there is no base pointer, there is nothing to do here. | |||
| 38308 | if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF)) | |||
| 38309 | return BB; | |||
| 38310 | ||||
| 38311 | // Even though this code does not necessarily need the base pointer to | |||
| 38312 | // be ESI, we check for that. The reason: if this assert fails, then | |||
| 38313 | // something has changed in the compiler's base pointer handling, and that | |||
| 38314 | // change most probably has to be addressed here as well. | |||
| 38315 | assert(TRI->getBaseRegister() == X86::ESI && | |||
| 38316 | "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a " | |||
| 38317 | "base pointer in mind"); | |||
| 38318 | ||||
| 38319 | MachineRegisterInfo &MRI = MF->getRegInfo(); | |||
| 38320 | MVT SPTy = getPointerTy(MF->getDataLayout()); | |||
| 38321 | const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy); | |||
| 38322 | Register computedAddrVReg = MRI.createVirtualRegister(AddrRegClass); | |||
| 38323 | ||||
| 38324 | X86AddressMode AM = getAddressFromInstr(&MI, 0); | |||
| 38325 | // Regalloc does not need any help when the memory operand of CMPXCHG8B | |||
| 38326 | // does not use an index register. | |||
| 38327 | if (AM.IndexReg == X86::NoRegister) | |||
| 38328 | return BB; | |||
| 38329 | ||||
| 38330 | // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its | |||
| 38331 | // four operand definitions that are E[ABCD] registers. We skip them and | |||
| 38332 | // then insert the LEA. | |||
| 38333 | MachineBasicBlock::reverse_iterator RMBBI(MI.getReverseIterator()); | |||
| 38334 | while (RMBBI != BB->rend() && (RMBBI->definesRegister(X86::EAX) || | |||
| 38335 | RMBBI->definesRegister(X86::EBX) || | |||
| 38336 | RMBBI->definesRegister(X86::ECX) || | |||
| 38337 | RMBBI->definesRegister(X86::EDX))) { | |||
| 38338 | ++RMBBI; | |||
| 38339 | } | |||
| 38340 | MachineBasicBlock::iterator MBBI(RMBBI); | |||
| 38341 | addFullAddress( | |||
| 38342 | BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM); | |||
| 38343 | ||||
| 38344 | setDirectAddressInInstr(&MI, 0, computedAddrVReg); | |||
| 38345 | ||||
| 38346 | return BB; | |||
| 38347 | } | |||
| 38348 | case X86::LCMPXCHG16B_NO_RBX: { | |||
| 38349 | const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); | |||
| 38350 | Register BasePtr = TRI->getBaseRegister(); | |||
| 38351 | if (TRI->hasBasePointer(*MF) && | |||
| 38352 | (BasePtr == X86::RBX || BasePtr == X86::EBX)) { | |||
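| | // CMPXCHG16B implicitly uses RBX for the low half of the replacement | |||
| | // value, so when RBX doubles as the base pointer it must be saved in a | |||
| | // vreg and swapped back in by the SAVE_RBX pseudo around the instruction. | |||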
| 38353 | if (!BB->isLiveIn(BasePtr)) | |||
| 38354 | BB->addLiveIn(BasePtr); | |||
| 38355 | // Save RBX into a virtual register. | |||
| 38356 | Register SaveRBX = | |||
| 38357 | MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); | |||
| 38358 | BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX) | |||
| 38359 | .addReg(X86::RBX); | |||
| 38360 | Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); | |||
| 38361 | MachineInstrBuilder MIB = | |||
| 38362 | BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B_SAVE_RBX), Dst); | |||
| 38363 | for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx) | |||
| 38364 | MIB.add(MI.getOperand(Idx)); | |||
| 38365 | MIB.add(MI.getOperand(X86::AddrNumOperands)); | |||
| 38366 | MIB.addReg(SaveRBX); | |||
| 38367 | } else { | |||
| 38368 | // Simple case, just copy the virtual register to RBX. | |||
| 38369 | BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::RBX) | |||
| 38370 | .add(MI.getOperand(X86::AddrNumOperands)); | |||
| 38371 | MachineInstrBuilder MIB = | |||
| 38372 | BuildMI(*BB, MI, DL, TII->get(X86::LCMPXCHG16B)); | |||
| 38373 | for (unsigned Idx = 0; Idx < X86::AddrNumOperands; ++Idx) | |||
| 38374 | MIB.add(MI.getOperand(Idx)); | |||
| 38375 | } | |||
| 38376 | MI.eraseFromParent(); | |||
| 38377 | return BB; | |||
| 38378 | } | |||
| 38379 | case X86::MWAITX: { | |||
| 38380 | const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); | |||
| 38381 | Register BasePtr = TRI->getBaseRegister(); | |||
| 38382 | bool IsRBX = (BasePtr == X86::RBX || BasePtr == X86::EBX); | |||
| 38383 | // If there is no need to save the base pointer, we generate MWAITXrrr; | |||
| 38384 | // otherwise we generate the pseudo MWAITX_SAVE_RBX. | |||
| 38385 | if (!IsRBX || !TRI->hasBasePointer(*MF)) { | |||
| 38386 | BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX) | |||
| 38387 | .addReg(MI.getOperand(0).getReg()); | |||
| 38388 | BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX) | |||
| 38389 | .addReg(MI.getOperand(1).getReg()); | |||
| 38390 | BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EBX) | |||
| 38391 | .addReg(MI.getOperand(2).getReg()); | |||
| 38392 | BuildMI(*BB, MI, DL, TII->get(X86::MWAITXrrr)); | |||
| 38393 | MI.eraseFromParent(); | |||
| 38394 | } else { | |||
| 38395 | if (!BB->isLiveIn(BasePtr)) { | |||
| 38396 | BB->addLiveIn(BasePtr); | |||
| 38397 | } | |||
| 38398 | // Parameters can be copied into ECX and EAX but not EBX yet. | |||
| 38399 | BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::ECX) | |||
| 38400 | .addReg(MI.getOperand(0).getReg()); | |||
| 38401 | BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), X86::EAX) | |||
| 38402 | .addReg(MI.getOperand(1).getReg()); | |||
| 38403 | assert(Subtarget.is64Bit() && "Expected 64-bit mode!"); | |||
| 38404 | // Save RBX into a virtual register. | |||
| 38405 | Register SaveRBX = | |||
| 38406 | MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); | |||
| 38407 | BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), SaveRBX) | |||
| 38408 | .addReg(X86::RBX); | |||
| 38409 | // Generate mwaitx pseudo. | |||
| 38410 | Register Dst = MF->getRegInfo().createVirtualRegister(&X86::GR64RegClass); | |||
| 38411 | BuildMI(*BB, MI, DL, TII->get(X86::MWAITX_SAVE_RBX)) | |||
| 38412 | .addDef(Dst) // Destination tied in with SaveRBX. | |||
| 38413 | .addReg(MI.getOperand(2).getReg()) // input value of EBX. | |||
| 38414 | .addUse(SaveRBX); // Save of base pointer. | |||
| 38415 | MI.eraseFromParent(); | |||
| 38416 | } | |||
| 38417 | return BB; | |||
| 38418 | } | |||
| 38419 | case TargetOpcode::PREALLOCATED_SETUP: { | |||
| 38420 | assert(Subtarget.is32Bit() && "preallocated only used in 32-bit"); | |||
| 38421 | auto MFI = MF->getInfo<X86MachineFunctionInfo>(); | |||
| 38422 | MFI->setHasPreallocatedCall(true); | |||
| 38423 | int64_t PreallocatedId = MI.getOperand(0).getImm(); | |||
| 38424 | size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId); | |||
| 38425 | assert(StackAdjustment != 0 && "0 stack adjustment"); | |||
| 38426 | LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment " | |||
| 38427 | << StackAdjustment << "\n"); | |||
| 38428 | BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP) | |||
| 38429 | .addReg(X86::ESP) | |||
| 38430 | .addImm(StackAdjustment); | |||
| 38431 | MI.eraseFromParent(); | |||
| 38432 | return BB; | |||
| 38433 | } | |||
| 38434 | case TargetOpcode::PREALLOCATED_ARG: { | |||
| 38435 | assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit"); | |||
| 38436 | int64_t PreallocatedId = MI.getOperand(1).getImm(); | |||
| 38437 | int64_t ArgIdx = MI.getOperand(2).getImm(); | |||
| 38438 | auto MFI = MF->getInfo<X86MachineFunctionInfo>(); | |||
| 38439 | size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx]; | |||
| 38440 | LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx | |||
| 38441 | << ", arg offset " << ArgOffset << "\n"); | |||
| 38442 | // stack pointer + offset | |||
| 38443 | addRegOffset( | |||
| 38444 | BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()), | |||
| 38445 | X86::ESP, false, ArgOffset); | |||
| 38446 | MI.eraseFromParent(); | |||
| 38447 | return BB; | |||
| 38448 | } | |||
| 38449 | case X86::PTDPBSSD: | |||
| 38450 | case X86::PTDPBSUD: | |||
| 38451 | case X86::PTDPBUSD: | |||
| 38452 | case X86::PTDPBUUD: | |||
| 38453 | case X86::PTDPBF16PS: | |||
| 38454 | case X86::PTDPFP16PS: { | |||
| 38455 | unsigned Opc; | |||
| 38456 | switch (MI.getOpcode()) { | |||
| 38457 | default: llvm_unreachable("illegal opcode!"); | |||
| 38458 | case X86::PTDPBSSD: Opc = X86::TDPBSSD; break; | |||
| 38459 | case X86::PTDPBSUD: Opc = X86::TDPBSUD; break; | |||
| 38460 | case X86::PTDPBUSD: Opc = X86::TDPBUSD; break; | |||
| 38461 | case X86::PTDPBUUD: Opc = X86::TDPBUUD; break; | |||
| 38462 | case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break; | |||
| 38463 | case X86::PTDPFP16PS: Opc = X86::TDPFP16PS; break; | |||
| 38464 | } | |||
| 38465 | ||||
| 38466 | MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); | |||
| 38467 | MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); | |||
| 38468 | MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef); | |||
| 38469 | MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); | |||
| 38470 | MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef); | |||
| 38471 | ||||
| 38472 | MI.eraseFromParent(); // The pseudo is gone now. | |||
| 38473 | return BB; | |||
| 38474 | } | |||
| 38475 | case X86::PTILEZERO: { | |||
| 38476 | unsigned Imm = MI.getOperand(0).getImm(); | |||
| 38477 | BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm)); | |||
| 38478 | MI.eraseFromParent(); // The pseudo is gone now. | |||
| 38479 | return BB; | |||
| 38480 | } | |||
| 38481 | case X86::PTILELOADD: | |||
| 38482 | case X86::PTILELOADDT1: | |||
| 38483 | case X86::PTILESTORED: { | |||
| 38484 | unsigned Opc; | |||
| 38485 | switch (MI.getOpcode()) { | |||
| 38486 | default: llvm_unreachable("illegal opcode!"); | |||
| 38487 | case X86::PTILELOADD: Opc = X86::TILELOADD; break; | |||
| 38488 | case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break; | |||
| 38489 | case X86::PTILESTORED: Opc = X86::TILESTORED; break; | |||
| 38490 | } | |||
| 38491 | ||||
| 38492 | MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); | |||
| 38493 | unsigned CurOp = 0; | |||
| 38494 | if (Opc != X86::TILESTORED) | |||
| 38495 | MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), | |||
| 38496 | RegState::Define); | |||
| 38497 | ||||
| 38498 | MIB.add(MI.getOperand(CurOp++)); // base | |||
| 38499 | MIB.add(MI.getOperand(CurOp++)); // scale | |||
| 38500 | MIB.add(MI.getOperand(CurOp++)); // index -- stride | |||
| 38501 | MIB.add(MI.getOperand(CurOp++)); // displacement | |||
| 38502 | MIB.add(MI.getOperand(CurOp++)); // segment | |||
| 38503 | ||||
| 38504 | if (Opc == X86::TILESTORED) | |||
| 38505 | MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), | |||
| 38506 | RegState::Undef); | |||
| 38507 | ||||
| 38508 | MI.eraseFromParent(); // The pseudo is gone now. | |||
| 38509 | return BB; | |||
| 38510 | } | |||
| 38511 | case X86::PTCMMIMFP16PS: | |||
| 38512 | case X86::PTCMMRLFP16PS: { | |||
| 38513 | const DebugLoc &DL = MI.getDebugLoc(); | |||
| 38514 | unsigned Opc; | |||
| 38515 | switch (MI.getOpcode()) { | |||
| 38516 | default: llvm_unreachable("Unexpected instruction!"); | |||
| 38517 | case X86::PTCMMIMFP16PS: Opc = X86::TCMMIMFP16PS; break; | |||
| 38518 | case X86::PTCMMRLFP16PS: Opc = X86::TCMMRLFP16PS; break; | |||
| 38519 | } | |||
| 38520 | MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc)); | |||
| 38521 | MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define); | |||
| 38522 | MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef); | |||
| 38523 | MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef); | |||
| 38524 | MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef); | |||
| 38525 | MI.eraseFromParent(); // The pseudo is gone now. | |||
| 38526 | return BB; | |||
| 38527 | } | |||
| 38528 | } | |||
| 38529 | } | |||
| 38530 | ||||
| 38531 | //===----------------------------------------------------------------------===// | |||
| 38532 | // X86 Optimization Hooks | |||
| 38533 | //===----------------------------------------------------------------------===// | |||
| 38534 | ||||
| 38535 | bool | |||
| 38536 | X86TargetLowering::targetShrinkDemandedConstant(SDValue Op, | |||
| 38537 | const APInt &DemandedBits, | |||
| 38538 | const APInt &DemandedElts, | |||
| 38539 | TargetLoweringOpt &TLO) const { | |||
| 38540 | EVT VT = Op.getValueType(); | |||
| 38541 | unsigned Opcode = Op.getOpcode(); | |||
| 38542 | unsigned EltSize = VT.getScalarSizeInBits(); | |||
| 38543 | ||||
| 38544 | if (VT.isVector()) { | |||
| 38545 | // If the constant is all sign bits within the active bits, then we should | |||
| 38546 | // extend it to the entire constant to allow it to act as a boolean constant | |||
| 38547 | // vector. | |||
| 38548 | auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) { | |||
| 38549 | if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode())) | |||
| 38550 | return false; | |||
| 38551 | for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) { | |||
| 38552 | if (!DemandedElts[i] || V.getOperand(i).isUndef()) | |||
| 38553 | continue; | |||
| 38554 | const APInt &Val = V.getConstantOperandAPInt(i); | |||
| 38555 | if (Val.getBitWidth() > Val.getNumSignBits() && | |||
| 38556 | Val.trunc(ActiveBits).getNumSignBits() == ActiveBits) | |||
| 38557 | return true; | |||
| 38558 | } | |||
| 38559 | return false; | |||
| 38560 | }; | |||
| 38561 | // For vectors - if we have a constant, then try to sign extend. | |||
| 38562 | // TODO: Handle AND/ANDN cases. | |||
| 38563 | unsigned ActiveBits = DemandedBits.getActiveBits(); | |||
| 38564 | if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) && | |||
| 38565 | (Opcode == ISD::OR || Opcode == ISD::XOR) && | |||
| 38566 | NeedsSignExtension(Op.getOperand(1), ActiveBits)) { | |||
| 38567 | EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits); | |||
| 38568 | EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT, | |||
| 38569 | VT.getVectorNumElements()); | |||
| 38570 | SDValue NewC = | |||
| 38571 | TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT, | |||
| 38572 | Op.getOperand(1), TLO.DAG.getValueType(ExtVT)); | |||
| 38573 | SDValue NewOp = | |||
| 38574 | TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC); | |||
| 38575 | return TLO.CombineTo(Op, NewOp); | |||
| 38576 | } | |||
| 38577 | return false; | |||
| 38578 | } | |||
| 38579 | ||||
| 38580 | // Only optimize Ands to prevent shrinking a constant that could be | |||
| 38581 | // matched by movzx. | |||
| 38582 | if (Opcode != ISD::AND) | |||
| 38583 | return false; | |||
| 38584 | ||||
| 38585 | // Make sure the RHS really is a constant. | |||
| 38586 | ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)); | |||
| 38587 | if (!C) | |||
| 38588 | return false; | |||
| 38589 | ||||
| 38590 | const APInt &Mask = C->getAPIntValue(); | |||
| 38591 | ||||
| 38592 | // Clear all non-demanded bits initially. | |||
| 38593 | APInt ShrunkMask = Mask & DemandedBits; | |||
| 38594 | ||||
| 38595 | // Find the width of the shrunk mask. | |||
| 38596 | unsigned Width = ShrunkMask.getActiveBits(); | |||
| 38597 | ||||
| 38598 | // If the mask is all 0s there's nothing to do here. | |||
| 38599 | if (Width == 0) | |||
| 38600 | return false; | |||
| 38601 | ||||
| 38602 | // Find the next power of 2 width, rounding up to a byte. | |||
| 38603 | Width = llvm::bit_ceil(std::max(Width, 8U)); | |||
| 38604 | // Clamp the width to the element size to handle illegal types. | |||
| 38605 | Width = std::min(Width, EltSize); | |||
| 38606 | ||||
| 38607 | // Calculate a possible zero extend mask for this constant. | |||
| 38608 | APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width); | |||
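| | // Worked example with EltSize == 32: Mask == 0xF0 and DemandedBits == 0xF0 | |||
| | // give ShrunkMask == 0xF0, Width == 8 and ZeroExtendMask == 0xFF. The extra | |||
| | // low bits are not demanded, so the AND constant may be widened to 0xFF, | |||
| | // which a movzx can match. | |||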
| 38609 | ||||
| 38610 | // If we aren't changing the mask, just return true to keep it and prevent | |||
| 38611 | // the caller from optimizing. | |||
| 38612 | if (ZeroExtendMask == Mask) | |||
| 38613 | return true; | |||
| 38614 | ||||
| 38615 | // Make sure the new mask can be represented by a combination of mask bits | |||
| 38616 | // and non-demanded bits. | |||
| 38617 | if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits)) | |||
| 38618 | return false; | |||
| 38619 | ||||
| 38620 | // Replace the constant with the zero extend mask. | |||
| 38621 | SDLoc DL(Op); | |||
| 38622 | SDValue NewC = TLO.DAG.getConstant(ZeroExtendMask, DL, VT); | |||
| 38623 | SDValue NewOp = TLO.DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), NewC); | |||
| 38624 | return TLO.CombineTo(Op, NewOp); | |||
| 38625 | } | |||
| 38626 | ||||
| 38627 | void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, | |||
| 38628 | KnownBits &Known, | |||
| 38629 | const APInt &DemandedElts, | |||
| 38630 | const SelectionDAG &DAG, | |||
| 38631 | unsigned Depth) const { | |||
| 38632 | unsigned BitWidth = Known.getBitWidth(); | |||
| 38633 | unsigned NumElts = DemandedElts.getBitWidth(); | |||
| 38634 | unsigned Opc = Op.getOpcode(); | |||
| 38635 | EVT VT = Op.getValueType(); | |||
| 38636 | assert((Opc >= ISD::BUILTIN_OP_END || | |||
| 38637 | Opc == ISD::INTRINSIC_WO_CHAIN || | |||
| 38638 | Opc == ISD::INTRINSIC_W_CHAIN || | |||
| 38639 | Opc == ISD::INTRINSIC_VOID) && | |||
| 38640 | "Should use MaskedValueIsZero if you don't know whether Op" | |||
| 38641 | " is a target node!"); | |||
| 38642 | ||||
| 38643 | Known.resetAll(); | |||
| 38644 | switch (Opc) { | |||
| 38645 | default: break; | |||
| 38646 | case X86ISD::MUL_IMM: { | |||
| 38647 | KnownBits Known2; | |||
| 38648 | Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); | |||
| 38649 | Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); | |||
| 38650 | Known = KnownBits::mul(Known, Known2); | |||
| 38651 | break; | |||
| 38652 | } | |||
| 38653 | case X86ISD::SETCC: | |||
| 38654 | Known.Zero.setBitsFrom(1); | |||
| 38655 | break; | |||
| 38656 | case X86ISD::MOVMSK: { | |||
| 38657 | unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements(); | |||
| 38658 | Known.Zero.setBitsFrom(NumLoBits); | |||
| 38659 | break; | |||
| 38660 | } | |||
| 38661 | case X86ISD::PEXTRB: | |||
| 38662 | case X86ISD::PEXTRW: { | |||
| 38663 | SDValue Src = Op.getOperand(0); | |||
| 38664 | EVT SrcVT = Src.getValueType(); | |||
| 38665 | APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(), | |||
| 38666 | Op.getConstantOperandVal(1)); | |||
| 38667 | Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1); | |||
| 38668 | Known = Known.anyextOrTrunc(BitWidth); | |||
| 38669 | Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits()); | |||
| 38670 | break; | |||
| 38671 | } | |||
| 38672 | case X86ISD::VSRAI: | |||
| 38673 | case X86ISD::VSHLI: | |||
| 38674 | case X86ISD::VSRLI: { | |||
| 38675 | unsigned ShAmt = Op.getConstantOperandVal(1); | |||
| 38676 | if (ShAmt >= VT.getScalarSizeInBits()) { | |||
| 38677 | // Out of range logical bit shifts are guaranteed to be zero. | |||
| 38678 | // Out of range arithmetic bit shifts splat the sign bit. | |||
| 38679 | if (Opc != X86ISD::VSRAI) { | |||
| 38680 | Known.setAllZero(); | |||
| 38681 | break; | |||
| 38682 | } | |||
| 38683 | ||||
| 38684 | ShAmt = VT.getScalarSizeInBits() - 1; | |||
| 38685 | } | |||
| 38686 | ||||
| 38687 | Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); | |||
| 38688 | if (Opc == X86ISD::VSHLI) { | |||
| 38689 | Known.Zero <<= ShAmt; | |||
| 38690 | Known.One <<= ShAmt; | |||
| 38691 | // Low bits are known zero. | |||
| 38692 | Known.Zero.setLowBits(ShAmt); | |||
| 38693 | } else if (Opc == X86ISD::VSRLI) { | |||
| 38694 | Known.Zero.lshrInPlace(ShAmt); | |||
| 38695 | Known.One.lshrInPlace(ShAmt); | |||
| 38696 | // High bits are known zero. | |||
| 38697 | Known.Zero.setHighBits(ShAmt); | |||
| 38698 | } else { | |||
| 38699 | Known.Zero.ashrInPlace(ShAmt); | |||
| 38700 | Known.One.ashrInPlace(ShAmt); | |||
| 38701 | } | |||
| 38702 | break; | |||
| 38703 | } | |||
| 38704 | case X86ISD::PACKUS: { | |||
| 38705 | // PACKUS is just a truncation if the upper half is zero. | |||
| 38706 | APInt DemandedLHS, DemandedRHS; | |||
| 38707 | getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); | |||
| 38708 | ||||
| 38709 | Known.One = APInt::getAllOnes(BitWidth * 2); | |||
| 38710 | Known.Zero = APInt::getAllOnes(BitWidth * 2); | |||
| 38711 | ||||
| 38712 | KnownBits Known2; | |||
| 38713 | if (!!DemandedLHS) { | |||
| 38714 | Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedLHS, Depth + 1); | |||
| 38715 | Known = KnownBits::commonBits(Known, Known2); | |||
| 38716 | } | |||
| 38717 | if (!!DemandedRHS) { | |||
| 38718 | Known2 = DAG.computeKnownBits(Op.getOperand(1), DemandedRHS, Depth + 1); | |||
| 38719 | Known = KnownBits::commonBits(Known, Known2); | |||
| 38720 | } | |||
| 38721 | ||||
| 38722 | if (Known.countMinLeadingZeros() < BitWidth) | |||
| 38723 | Known.resetAll(); | |||
| 38724 | Known = Known.trunc(BitWidth); | |||
| 38725 | break; | |||
| 38726 | } | |||
| 38727 | case X86ISD::VBROADCAST: { | |||
| 38728 | SDValue Src = Op.getOperand(0); | |||
| 38729 | if (!Src.getSimpleValueType().isVector()) { | |||
| 38730 | Known = DAG.computeKnownBits(Src, Depth + 1); | |||
| 38731 | return; | |||
| 38732 | } | |||
| 38733 | break; | |||
| 38734 | } | |||
| 38735 | case X86ISD::AND: { | |||
| 38736 | if (Op.getResNo() == 0) { | |||
| 38737 | KnownBits Known2; | |||
| 38738 | Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); | |||
| 38739 | Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); | |||
| 38740 | Known &= Known2; | |||
| 38741 | } | |||
| 38742 | break; | |||
| 38743 | } | |||
| 38744 | case X86ISD::ANDNP: { | |||
| 38745 | KnownBits Known2; | |||
| 38746 | Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); | |||
| 38747 | Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); | |||
| 38748 | ||||
| 38749 | // ANDNP = (~X & Y); | |||
| 38750 | Known.One &= Known2.Zero; | |||
| 38751 | Known.Zero |= Known2.One; | |||
| 38752 | break; | |||
| 38753 | } | |||
| 38754 | case X86ISD::FOR: { | |||
| 38755 | KnownBits Known2; | |||
| 38756 | Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); | |||
| 38757 | Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); | |||
| 38758 | ||||
| 38759 | Known |= Known2; | |||
| 38760 | break; | |||
| 38761 | } | |||
| 38762 | case X86ISD::PSADBW: { | |||
| 38763 | assert(VT.getScalarType() == MVT::i64 && | |||
| 38764 | Op.getOperand(0).getValueType().getScalarType() == MVT::i8 && | |||
| 38765 | "Unexpected PSADBW types"); | |||
| 38766 | ||||
| 38767 | // PSADBW - fills low 16 bits and zeros upper 48 bits of each i64 result. | |||
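| | // (Eight absolute byte differences sum to at most 8 * 255 = 2040, which | |||
| | // comfortably fits in 16 bits.) | |||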
| 38768 | Known.Zero.setBitsFrom(16); | |||
| 38769 | break; | |||
| 38770 | } | |||
| 38771 | case X86ISD::PCMPGT: | |||
| 38772 | case X86ISD::PCMPEQ: { | |||
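| | // Vector compares produce all-ones (true) or all-zero (false) per lane, | |||
| | // so a statically decidable predicate fixes every bit of the result. | |||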
| 38773 | KnownBits KnownLhs = | |||
| 38774 | DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); | |||
| 38775 | KnownBits KnownRhs = | |||
| 38776 | DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); | |||
| 38777 | std::optional<bool> Res = Opc == X86ISD::PCMPEQ | |||
| 38778 | ? KnownBits::eq(KnownLhs, KnownRhs) | |||
| 38779 | : KnownBits::sgt(KnownLhs, KnownRhs); | |||
| 38780 | if (Res) { | |||
| 38781 | if (*Res) | |||
| 38782 | Known.setAllOnes(); | |||
| 38783 | else | |||
| 38784 | Known.setAllZero(); | |||
| 38785 | } | |||
| 38786 | break; | |||
| 38787 | } | |||
| 38788 | case X86ISD::PMULUDQ: { | |||
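| | // PMULUDQ multiplies the unsigned low 32 bits of each 64-bit element; | |||
| | // model the operands as zext(trunc(x)) before computing the product. | |||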
| 38789 | KnownBits Known2; | |||
| 38790 | Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); | |||
| 38791 | Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); | |||
| 38792 | ||||
| 38793 | Known = Known.trunc(BitWidth / 2).zext(BitWidth); | |||
| 38794 | Known2 = Known2.trunc(BitWidth / 2).zext(BitWidth); | |||
| 38795 | Known = KnownBits::mul(Known, Known2); | |||
| 38796 | break; | |||
| 38797 | } | |||
| 38798 | case X86ISD::CMOV: { | |||
| 38799 | Known = DAG.computeKnownBits(Op.getOperand(1), Depth + 1); | |||
| 38800 | // If we don't know any bits, early out. | |||
| 38801 | if (Known.isUnknown()) | |||
| 38802 | break; | |||
| 38803 | KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); | |||
| 38804 | ||||
| 38805 | // Only known if known in both the LHS and RHS. | |||
| 38806 | Known = KnownBits::commonBits(Known, Known2); | |||
| 38807 | break; | |||
| 38808 | } | |||
| 38809 | case X86ISD::BEXTR: | |||
| 38810 | case X86ISD::BEXTRI: { | |||
| 38811 | SDValue Op0 = Op.getOperand(0); | |||
| 38812 | SDValue Op1 = Op.getOperand(1); | |||
| 38813 | ||||
| 38814 | if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) { | |||
| 38815 | unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0); | |||
| 38816 | unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8); | |||
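| | // The BEXTR control operand packs the start position in bits 7:0 and the | |||
| | // extraction length in bits 15:8. | |||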
| 38817 | ||||
| 38818 | // If the length is 0, the result is 0. | |||
| 38819 | if (Length == 0) { | |||
| 38820 | Known.setAllZero(); | |||
| 38821 | break; | |||
| 38822 | } | |||
| 38823 | ||||
| 38824 | if ((Shift + Length) <= BitWidth) { | |||
| 38825 | Known = DAG.computeKnownBits(Op0, Depth + 1); | |||
| 38826 | Known = Known.extractBits(Length, Shift); | |||
| 38827 | Known = Known.zextOrTrunc(BitWidth); | |||
| 38828 | } | |||
| 38829 | } | |||
| 38830 | break; | |||
| 38831 | } | |||
| 38832 | case X86ISD::PDEP: { | |||
| 38833 | KnownBits Known2; | |||
| 38834 | Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); | |||
| 38835 | Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1); | |||
| 38836 | // Zeros are retained from the mask operand, but ones are not. | |||
| 38837 | Known.One.clearAllBits(); | |||
| 38838 | // The result will have at least as many trailing zeros as the non-mask | |||
| 38839 | // operand since bits can only map to the same or higher bit position. | |||
| 38840 | Known.Zero.setLowBits(Known2.countMinTrailingZeros()); | |||
| 38841 | break; | |||
| 38842 | } | |||
| 38843 | case X86ISD::PEXT: { | |||
| 38844 | Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1); | |||
| 38845 | // The result has as many leading zeros as the number of zeros in the mask. | |||
| 38846 | unsigned Count = Known.Zero.popcount(); | |||
| 38847 | Known.Zero = APInt::getHighBitsSet(BitWidth, Count); | |||
| 38848 | Known.One.clearAllBits(); | |||
| 38849 | break; | |||
| 38850 | } | |||
| 38851 | case X86ISD::VTRUNC: | |||
| 38852 | case X86ISD::VTRUNCS: | |||
| 38853 | case X86ISD::VTRUNCUS: | |||
| 38854 | case X86ISD::CVTSI2P: | |||
| 38855 | case X86ISD::CVTUI2P: | |||
| 38856 | case X86ISD::CVTP2SI: | |||
| 38857 | case X86ISD::CVTP2UI: | |||
| 38858 | case X86ISD::MCVTP2SI: | |||
| 38859 | case X86ISD::MCVTP2UI: | |||
| 38860 | case X86ISD::CVTTP2SI: | |||
| 38861 | case X86ISD::CVTTP2UI: | |||
| 38862 | case X86ISD::MCVTTP2SI: | |||
| 38863 | case X86ISD::MCVTTP2UI: | |||
| 38864 | case X86ISD::MCVTSI2P: | |||
| 38865 | case X86ISD::MCVTUI2P: | |||
| 38866 | case X86ISD::VFPROUND: | |||
| 38867 | case X86ISD::VMFPROUND: | |||
| 38868 | case X86ISD::CVTPS2PH: | |||
| 38869 | case X86ISD::MCVTPS2PH: { | |||
| 38870 | // Truncations/Conversions - upper elements are known zero. | |||
| 38871 | EVT SrcVT = Op.getOperand(0).getValueType(); | |||
| 38872 | if (SrcVT.isVector()) { | |||
| 38873 | unsigned NumSrcElts = SrcVT.getVectorNumElements(); | |||
| 38874 | if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts) | |||
| 38875 | Known.setAllZero(); | |||
| 38876 | } | |||
| 38877 | break; | |||
| 38878 | } | |||
| 38879 | case X86ISD::STRICT_CVTTP2SI: | |||
| 38880 | case X86ISD::STRICT_CVTTP2UI: | |||
| 38881 | case X86ISD::STRICT_CVTSI2P: | |||
| 38882 | case X86ISD::STRICT_CVTUI2P: | |||
| 38883 | case X86ISD::STRICT_VFPROUND: | |||
| 38884 | case X86ISD::STRICT_CVTPS2PH: { | |||
| 38885 | // Strict Conversions - upper elements are known zero. | |||
| 38886 | EVT SrcVT = Op.getOperand(1).getValueType(); | |||
| 38887 | if (SrcVT.isVector()) { | |||
| 38888 | unsigned NumSrcElts = SrcVT.getVectorNumElements(); | |||
| 38889 | if (NumElts > NumSrcElts && DemandedElts.countr_zero() >= NumSrcElts) | |||
| 38890 | Known.setAllZero(); | |||
| 38891 | } | |||
| 38892 | break; | |||
| 38893 | } | |||
| 38894 | case X86ISD::MOVQ2DQ: { | |||
| 38895 | // Move from MMX to XMM. Upper half of XMM should be 0. | |||
| 38896 | if (DemandedElts.countr_zero() >= (NumElts / 2)) | |||
| 38897 | Known.setAllZero(); | |||
| 38898 | break; | |||
| 38899 | } | |||
| 38900 | case X86ISD::VBROADCAST_LOAD: { | |||
| 38901 | APInt UndefElts; | |||
| 38902 | SmallVector<APInt, 16> EltBits; | |||
| 38903 | if (getTargetConstantBitsFromNode(Op, BitWidth, UndefElts, EltBits, | |||
| 38904 | /*AllowWholeUndefs*/ false, | |||
| 38905 | /*AllowPartialUndefs*/ false)) { | |||
| 38906 | Known.Zero.setAllBits(); | |||
| 38907 | Known.One.setAllBits(); | |||
| 38908 | for (unsigned I = 0; I != NumElts; ++I) { | |||
| 38909 | if (!DemandedElts[I]) | |||
| 38910 | continue; | |||
| 38911 | if (UndefElts[I]) { | |||
| 38912 | Known.resetAll(); | |||
| 38913 | break; | |||
| 38914 | } | |||
| 38915 | KnownBits Known2 = KnownBits::makeConstant(EltBits[I]); | |||
| 38916 | Known = KnownBits::commonBits(Known, Known2); | |||
| 38917 | } | |||
| 38918 | return; | |||
| 38919 | } | |||
| 38920 | break; | |||
| 38921 | } | |||
| 38922 | } | |||
| 38923 | ||||
| 38924 | // Handle target shuffles. | |||
| 38925 | // TODO - use resolveTargetShuffleInputs once we can limit recursive depth. | |||
| 38926 | if (isTargetShuffle(Opc)) { | |||
| 38927 | SmallVector<int, 64> Mask; | |||
| 38928 | SmallVector<SDValue, 2> Ops; | |||
| 38929 | if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) { | |||
| 38930 | unsigned NumOps = Ops.size(); | |||
| 38931 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 38932 | if (Mask.size() == NumElts) { | |||
| 38933 | SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0)); | |||
| 38934 | Known.Zero.setAllBits(); Known.One.setAllBits(); | |||
| 38935 | for (unsigned i = 0; i != NumElts; ++i) { | |||
| 38936 | if (!DemandedElts[i]) | |||
| 38937 | continue; | |||
| 38938 | int M = Mask[i]; | |||
| 38939 | if (M == SM_SentinelUndef) { | |||
| 38940 | // For UNDEF elements, we don't know anything about the common state | |||
| 38941 | // of the shuffle result. | |||
| 38942 | Known.resetAll(); | |||
| 38943 | break; | |||
| 38944 | } | |||
| 38945 | if (M == SM_SentinelZero) { | |||
| 38946 | Known.One.clearAllBits(); | |||
| 38947 | continue; | |||
| 38948 | } | |||
| 38949 | assert(0 <= M && (unsigned)M < (NumOps * NumElts) && | |||
| 38950 | "Shuffle index out of range"); | |||
| 38951 | ||||
| 38952 | unsigned OpIdx = (unsigned)M / NumElts; | |||
| 38953 | unsigned EltIdx = (unsigned)M % NumElts; | |||
| 38954 | if (Ops[OpIdx].getValueType() != VT) { | |||
| 38955 | // TODO - handle target shuffle ops with different value types. | |||
| 38956 | Known.resetAll(); | |||
| 38957 | break; | |||
| 38958 | } | |||
| 38959 | DemandedOps[OpIdx].setBit(EltIdx); | |||
| 38960 | } | |||
| 38961 | // Known bits are the values that are shared by every demanded element. | |||
| 38962 | for (unsigned i = 0; i != NumOps && !Known.isUnknown(); ++i) { | |||
| 38963 | if (!DemandedOps[i]) | |||
| 38964 | continue; | |||
| 38965 | KnownBits Known2 = | |||
| 38966 | DAG.computeKnownBits(Ops[i], DemandedOps[i], Depth + 1); | |||
| 38967 | Known = KnownBits::commonBits(Known, Known2); | |||
| 38968 | } | |||
| 38969 | } | |||
| 38970 | } | |||
| 38971 | } | |||
| 38972 | } | |||
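| | | |||
| | // A minimal standalone sketch (illustrative only, not called from this | |||
| | // file) of the merging idiom used above for VBROADCAST_LOAD and target | |||
| | // shuffles: start from the "all bits known" state (Zero and One all set) | |||
| | // and intersect with each demanded element via KnownBits::commonBits, so | |||
| | // only bits shared by every demanded element survive. | |||
| | LLVM_ATTRIBUTE_UNUSED static KnownBits | |||
| | commonBitsOfDemandedElts(ArrayRef<APInt> EltBits, const APInt &DemandedElts) { | |||
| | assert(!EltBits.empty() && "Expected at least one element"); | |||
| | KnownBits Known(EltBits[0].getBitWidth()); | |||
| | Known.Zero.setAllBits(); | |||
| | Known.One.setAllBits(); | |||
| | for (unsigned I = 0, E = EltBits.size(); I != E; ++I) { | |||
| | if (!DemandedElts[I]) | |||
| | continue; | |||
| | Known = KnownBits::commonBits(Known, KnownBits::makeConstant(EltBits[I])); | |||
| | } | |||
| | return Known; | |||
| | } | |||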
| 38973 | ||||
| 38974 | unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( | |||
| 38975 | SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, | |||
| 38976 | unsigned Depth) const { | |||
| 38977 | EVT VT = Op.getValueType(); | |||
| 38978 | unsigned VTBits = VT.getScalarSizeInBits(); | |||
| 38979 | unsigned Opcode = Op.getOpcode(); | |||
| 38980 | switch (Opcode) { | |||
| 38981 | case X86ISD::SETCC_CARRY: | |||
| 38982 | // SETCC_CARRY sets the dest to ~0 for true or 0 for false. | |||
| 38983 | return VTBits; | |||
| 38984 | ||||
| 38985 | case X86ISD::VTRUNC: { | |||
| 38986 | SDValue Src = Op.getOperand(0); | |||
| 38987 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 38988 | unsigned NumSrcBits = SrcVT.getScalarSizeInBits(); | |||
| 38989 | assert(VTBits < NumSrcBits && "Illegal truncation input type"); | |||
| 38990 | APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); | |||
| 38991 | unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1); | |||
| 38992 | if (Tmp > (NumSrcBits - VTBits)) | |||
| 38993 | return Tmp - (NumSrcBits - VTBits); | |||
| 38994 | return 1; | |||
| 38995 | } | |||
| 38996 | ||||
| 38997 | case X86ISD::PACKSS: { | |||
| 38998 | // PACKSS is just a truncation if the sign bits extend to the packed size. | |||
| 38999 | APInt DemandedLHS, DemandedRHS; | |||
| 39000 | getPackDemandedElts(Op.getValueType(), DemandedElts, DemandedLHS, | |||
| 39001 | DemandedRHS); | |||
| 39002 | ||||
| 39003 | unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits(); | |||
| 39004 | unsigned Tmp0 = SrcBits, Tmp1 = SrcBits; | |||
| 39005 | if (!!DemandedLHS) | |||
| 39006 | Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1); | |||
| 39007 | if (!!DemandedRHS) | |||
| 39008 | Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1); | |||
| 39009 | unsigned Tmp = std::min(Tmp0, Tmp1); | |||
| 39010 | if (Tmp > (SrcBits - VTBits)) | |||
| 39011 | return Tmp - (SrcBits - VTBits); | |||
| 39012 | return 1; | |||
| 39013 | } | |||
| 39014 | ||||
| 39015 | case X86ISD::VBROADCAST: { | |||
| 39016 | SDValue Src = Op.getOperand(0); | |||
| 39017 | if (!Src.getSimpleValueType().isVector()) | |||
| 39018 | return DAG.ComputeNumSignBits(Src, Depth + 1); | |||
| 39019 | break; | |||
| 39020 | } | |||
| 39021 | ||||
| 39022 | case X86ISD::VSHLI: { | |||
| 39023 | SDValue Src = Op.getOperand(0); | |||
| 39024 | const APInt &ShiftVal = Op.getConstantOperandAPInt(1); | |||
| 39025 | if (ShiftVal.uge(VTBits)) | |||
| 39026 | return VTBits; // Shifted all bits out --> zero. | |||
| 39027 | unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1); | |||
| 39028 | if (ShiftVal.uge(Tmp)) | |||
| 39029 | return 1; // Shifted all sign bits out --> unknown. | |||
| 39030 | return Tmp - ShiftVal.getZExtValue(); | |||
| 39031 | } | |||
| 39032 | ||||
| 39033 | case X86ISD::VSRAI: { | |||
| 39034 | SDValue Src = Op.getOperand(0); | |||
| 39035 | APInt ShiftVal = Op.getConstantOperandAPInt(1); | |||
| 39036 | if (ShiftVal.uge(VTBits - 1)) | |||
| 39037 | return VTBits; // Sign splat. | |||
| 39038 | unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1); | |||
| 39039 | ShiftVal += Tmp; | |||
| 39040 | return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue(); | |||
| 39041 | } | |||
| 39042 | ||||
| 39043 | case X86ISD::FSETCC: | |||
| 39044 | // cmpss/cmpsd return zero/all-bits result values in the bottom element. | |||
| 39045 | if (VT == MVT::f32 || VT == MVT::f64 || | |||
| 39046 | ((VT == MVT::v4f32 || VT == MVT::v2f64) && DemandedElts == 1)) | |||
| 39047 | return VTBits; | |||
| 39048 | break; | |||
| 39049 | ||||
| 39050 | case X86ISD::PCMPGT: | |||
| 39051 | case X86ISD::PCMPEQ: | |||
| 39052 | case X86ISD::CMPP: | |||
| 39053 | case X86ISD::VPCOM: | |||
| 39054 | case X86ISD::VPCOMU: | |||
| 39055 | // Vector compares return zero/all-bits result values. | |||
| 39056 | return VTBits; | |||
| 39057 | ||||
| 39058 | case X86ISD::ANDNP: { | |||
| 39059 | unsigned Tmp0 = | |||
| 39060 | DAG.ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1); | |||
| 39061 | if (Tmp0 == 1) return 1; // Early out. | |||
| 39062 | unsigned Tmp1 = | |||
| 39063 | DAG.ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1); | |||
| 39064 | return std::min(Tmp0, Tmp1); | |||
| 39065 | } | |||
| 39066 | ||||
| 39067 | case X86ISD::CMOV: { | |||
| 39068 | unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1); | |||
| 39069 | if (Tmp0 == 1) return 1; // Early out. | |||
| 39070 | unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1); | |||
| 39071 | return std::min(Tmp0, Tmp1); | |||
| 39072 | } | |||
| 39073 | } | |||
| 39074 | ||||
| 39075 | // Handle target shuffles. | |||
| 39076 | // TODO - use resolveTargetShuffleInputs once we can limit recursive depth. | |||
| 39077 | if (isTargetShuffle(Opcode)) { | |||
| 39078 | SmallVector<int, 64> Mask; | |||
| 39079 | SmallVector<SDValue, 2> Ops; | |||
| 39080 | if (getTargetShuffleMask(Op.getNode(), VT.getSimpleVT(), true, Ops, Mask)) { | |||
| 39081 | unsigned NumOps = Ops.size(); | |||
| 39082 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 39083 | if (Mask.size() == NumElts) { | |||
| 39084 | SmallVector<APInt, 2> DemandedOps(NumOps, APInt(NumElts, 0)); | |||
| 39085 | for (unsigned i = 0; i != NumElts; ++i) { | |||
| 39086 | if (!DemandedElts[i]) | |||
| 39087 | continue; | |||
| 39088 | int M = Mask[i]; | |||
| 39089 | if (M == SM_SentinelUndef) { | |||
| 39090 | // For UNDEF elements, we don't know anything about the common state | |||
| 39091 | // of the shuffle result. | |||
| 39092 | return 1; | |||
| 39093 | } else if (M == SM_SentinelZero) { | |||
| 39094 | // Zero = all sign bits. | |||
| 39095 | continue; | |||
| 39096 | } | |||
| 39097 | assert(0 <= M && (unsigned)M < (NumOps * NumElts) && | |||
| 39098 | "Shuffle index out of range"); | |||
| 39099 | ||||
| 39100 | unsigned OpIdx = (unsigned)M / NumElts; | |||
| 39101 | unsigned EltIdx = (unsigned)M % NumElts; | |||
| 39102 | if (Ops[OpIdx].getValueType() != VT) { | |||
| 39103 | // TODO - handle target shuffle ops with different value types. | |||
| 39104 | return 1; | |||
| 39105 | } | |||
| 39106 | DemandedOps[OpIdx].setBit(EltIdx); | |||
| 39107 | } | |||
| 39108 | unsigned Tmp0 = VTBits; | |||
| 39109 | for (unsigned i = 0; i != NumOps && Tmp0 > 1; ++i) { | |||
| 39110 | if (!DemandedOps[i]) | |||
| 39111 | continue; | |||
| 39112 | unsigned Tmp1 = | |||
| 39113 | DAG.ComputeNumSignBits(Ops[i], DemandedOps[i], Depth + 1); | |||
| 39114 | Tmp0 = std::min(Tmp0, Tmp1); | |||
| 39115 | } | |||
| 39116 | return Tmp0; | |||
| 39117 | } | |||
| 39118 | } | |||
| 39119 | } | |||
| 39120 | ||||
| 39121 | // Fallback case. | |||
| 39122 | return 1; | |||
| 39123 | } | |||
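| | | |||
| | // Illustrative-only scalar restatement (not used by the lowering code) of | |||
| | // the VSHLI/VSRAI sign-bit arithmetic in the switch above: a left shift | |||
| | // consumes sign bits, an arithmetic right shift adds them, and both | |||
| | // results clamp to the [1, VTBits] range. | |||
| | LLVM_ATTRIBUTE_UNUSED static unsigned | |||
| | signBitsAfterShl(unsigned SrcSignBits, unsigned Amt, unsigned VTBits) { | |||
| | if (Amt >= VTBits) | |||
| | return VTBits; // Shifted all bits out --> known zero. | |||
| | if (Amt >= SrcSignBits) | |||
| | return 1; // Shifted all sign bits out --> unknown. | |||
| | return SrcSignBits - Amt; | |||
| | } | |||
| | LLVM_ATTRIBUTE_UNUSED static unsigned | |||
| | signBitsAfterAshr(unsigned SrcSignBits, unsigned Amt, unsigned VTBits) { | |||
| | if (Amt >= VTBits - 1) | |||
| | return VTBits; // Sign splat. | |||
| | return std::min(SrcSignBits + Amt, VTBits); | |||
| | } | |||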
| 39124 | ||||
| 39125 | SDValue X86TargetLowering::unwrapAddress(SDValue N) const { | |||
| 39126 | if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP) | |||
| 39127 | return N->getOperand(0); | |||
| 39128 | return N; | |||
| 39129 | } | |||
| 39130 | ||||
| 39131 | // Helper to look for a normal load that can be narrowed into a vzload with the | |||
| 39132 | // specified VT and memory VT. Returns SDValue() on failure. | |||
| 39133 | static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT, | |||
| 39134 | SelectionDAG &DAG) { | |||
| 39135 | // Can't if the load is volatile or atomic. | |||
| 39136 | if (!LN->isSimple()) | |||
| 39137 | return SDValue(); | |||
| 39138 | ||||
| 39139 | SDVTList Tys = DAG.getVTList(VT, MVT::Other); | |||
| 39140 | SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; | |||
| 39141 | return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT, | |||
| 39142 | LN->getPointerInfo(), LN->getOriginalAlign(), | |||
| 39143 | LN->getMemOperand()->getFlags()); | |||
| 39144 | } | |||
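| | | |||
| | // A hedged usage sketch for narrowLoadToVZLoad; the caller below is | |||
| | // hypothetical and not part of this file. Given a load whose upper vector | |||
| | // elements are known to be discarded, a combine can replace it with a | |||
| | // VZEXT_LOAD that reads only the low 64 bits and zeros the rest. | |||
| | LLVM_ATTRIBUTE_UNUSED static SDValue | |||
| | tryNarrowToVZLoadExample(SDValue Src, SelectionDAG &DAG) { | |||
| | if (auto *LN = dyn_cast<LoadSDNode>(Src)) | |||
| | if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) { | |||
| | // The old load's chain result must be rewired to the new node. | |||
| | DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); | |||
| | return VZLoad; | |||
| | } | |||
| | return SDValue(); | |||
| | } | |||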
| 39145 | ||||
| 39146 | // Attempt to match a combined shuffle mask against supported unary shuffle | |||
| 39147 | // instructions. | |||
| 39148 | // TODO: Investigate sharing more of this with shuffle lowering. | |||
| 39149 | static bool matchUnaryShuffle(MVT MaskVT, ArrayRef<int> Mask, | |||
| 39150 | bool AllowFloatDomain, bool AllowIntDomain, | |||
| 39151 | SDValue V1, const SelectionDAG &DAG, | |||
| 39152 | const X86Subtarget &Subtarget, unsigned &Shuffle, | |||
| 39153 | MVT &SrcVT, MVT &DstVT) { | |||
| 39154 | unsigned NumMaskElts = Mask.size(); | |||
| 39155 | unsigned MaskEltSize = MaskVT.getScalarSizeInBits(); | |||
| 39156 | ||||
| 39157 | // Match against a VZEXT_MOVL vXi32 and vXi16 zero-extending instruction. | |||
| 39158 | if (Mask[0] == 0 && | |||
| 39159 | (MaskEltSize == 32 || (MaskEltSize == 16 && Subtarget.hasFP16()))) { | |||
| 39160 | if ((isUndefOrZero(Mask[1]) && isUndefInRange(Mask, 2, NumMaskElts - 2)) || | |||
| 39161 | (V1.getOpcode() == ISD::SCALAR_TO_VECTOR && | |||
| 39162 | isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1))) { | |||
| 39163 | Shuffle = X86ISD::VZEXT_MOVL; | |||
| 39164 | if (MaskEltSize == 16) | |||
| 39165 | SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16); | |||
| 39166 | else | |||
| 39167 | SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; | |||
| 39168 | return true; | |||
| 39169 | } | |||
| 39170 | } | |||
| 39171 | ||||
| 39172 | // Match against an ANY/ZERO_EXTEND_VECTOR_INREG instruction. | |||
| 39173 | // TODO: Add 512-bit vector support (split AVX512F and AVX512BW). | |||
| 39174 | if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || | |||
| 39175 | (MaskVT.is256BitVector() && Subtarget.hasInt256()))) { | |||
| 39176 | unsigned MaxScale = 64 / MaskEltSize; | |||
| 39177 | for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) { | |||
| 39178 | bool MatchAny = true; | |||
| 39179 | bool MatchZero = true; | |||
| 39180 | unsigned NumDstElts = NumMaskElts / Scale; | |||
| 39181 | for (unsigned i = 0; i != NumDstElts && (MatchAny || MatchZero); ++i) { | |||
| 39182 | if (!isUndefOrEqual(Mask[i * Scale], (int)i)) { | |||
| 39183 | MatchAny = MatchZero = false; | |||
| 39184 | break; | |||
| 39185 | } | |||
| 39186 | MatchAny &= isUndefInRange(Mask, (i * Scale) + 1, Scale - 1); | |||
| 39187 | MatchZero &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1); | |||
| 39188 | } | |||
| 39189 | if (MatchAny || MatchZero) { | |||
| 39190 | assert(MatchZero && "Failed to match zext but matched aext?"); | |||
| 39191 | unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize); | |||
| 39192 | MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() : | |||
| 39193 | MVT::getIntegerVT(MaskEltSize); | |||
| 39194 | SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize); | |||
| 39195 | ||||
| 39196 | Shuffle = unsigned(MatchAny ? ISD::ANY_EXTEND : ISD::ZERO_EXTEND); | |||
| 39197 | if (SrcVT.getVectorNumElements() != NumDstElts) | |||
| 39198 | Shuffle = DAG.getOpcode_EXTEND_VECTOR_INREG(Shuffle); | |||
| 39199 | ||||
| 39200 | DstVT = MVT::getIntegerVT(Scale * MaskEltSize); | |||
| 39201 | DstVT = MVT::getVectorVT(DstVT, NumDstElts); | |||
| 39202 | return true; | |||
| 39203 | } | |||
| 39204 | } | |||
| 39205 | } | |||
| 39206 | ||||
| 39207 | // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS). | |||
| 39208 | if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2()) || | |||
| 39209 | (MaskEltSize == 16 && Subtarget.hasFP16())) && | |||
| 39210 | isUndefOrEqual(Mask[0], 0) && | |||
| 39211 | isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) { | |||
| 39212 | Shuffle = X86ISD::VZEXT_MOVL; | |||
| 39213 | if (MaskEltSize == 16) | |||
| 39214 | SrcVT = DstVT = MaskVT.changeVectorElementType(MVT::f16); | |||
| 39215 | else | |||
| 39216 | SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT; | |||
| 39217 | return true; | |||
| 39218 | } | |||
| 39219 | ||||
| 39220 | // Check if we have SSE3 which will let us use MOVDDUP etc. The | |||
| 39221 | // instructions are no slower than UNPCKLPD but have the option to | |||
| 39222 | // fold the input operand into even an unaligned memory load. | |||
| 39223 | if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) { | |||
| 39224 | if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG, V1)) { | |||
| 39225 | Shuffle = X86ISD::MOVDDUP; | |||
| 39226 | SrcVT = DstVT = MVT::v2f64; | |||
| 39227 | return true; | |||
| 39228 | } | |||
| 39229 | if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) { | |||
| 39230 | Shuffle = X86ISD::MOVSLDUP; | |||
| 39231 | SrcVT = DstVT = MVT::v4f32; | |||
| 39232 | return true; | |||
| 39233 | } | |||
| 39234 | if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3}, DAG, V1)) { | |||
| 39235 | Shuffle = X86ISD::MOVSHDUP; | |||
| 39236 | SrcVT = DstVT = MVT::v4f32; | |||
| 39237 | return true; | |||
| 39238 | } | |||
| 39239 | } | |||
| 39240 | ||||
| 39241 | if (MaskVT.is256BitVector() && AllowFloatDomain) { | |||
| 39242 | assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles"); | |||
| 39243 | if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2}, DAG, V1)) { | |||
| 39244 | Shuffle = X86ISD::MOVDDUP; | |||
| 39245 | SrcVT = DstVT = MVT::v4f64; | |||
| 39246 | return true; | |||
| 39247 | } | |||
| 39248 | if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG, | |||
| 39249 | V1)) { | |||
| 39250 | Shuffle = X86ISD::MOVSLDUP; | |||
| 39251 | SrcVT = DstVT = MVT::v8f32; | |||
| 39252 | return true; | |||
| 39253 | } | |||
| 39254 | if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1, 3, 3, 5, 5, 7, 7}, DAG, | |||
| 39255 | V1)) { | |||
| 39256 | Shuffle = X86ISD::MOVSHDUP; | |||
| 39257 | SrcVT = DstVT = MVT::v8f32; | |||
| 39258 | return true; | |||
| 39259 | } | |||
| 39260 | } | |||
| 39261 | ||||
| 39262 | if (MaskVT.is512BitVector() && AllowFloatDomain) { | |||
| 39263 | assert(Subtarget.hasAVX512() && | |||
| 39264 | "AVX512 required for 512-bit vector shuffles"); | |||
| 39265 | if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0, 2, 2, 4, 4, 6, 6}, DAG, | |||
| 39266 | V1)) { | |||
| 39267 | Shuffle = X86ISD::MOVDDUP; | |||
| 39268 | SrcVT = DstVT = MVT::v8f64; | |||
| 39269 | return true; | |||
| 39270 | } | |||
| 39271 | if (isTargetShuffleEquivalent( | |||
| 39272 | MaskVT, Mask, | |||
| 39273 | {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14}, DAG, V1)) { | |||
| 39274 | Shuffle = X86ISD::MOVSLDUP; | |||
| 39275 | SrcVT = DstVT = MVT::v16f32; | |||
| 39276 | return true; | |||
| 39277 | } | |||
| 39278 | if (isTargetShuffleEquivalent( | |||
| 39279 | MaskVT, Mask, | |||
| 39280 | {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15}, DAG, V1)) { | |||
| 39281 | Shuffle = X86ISD::MOVSHDUP; | |||
| 39282 | SrcVT = DstVT = MVT::v16f32; | |||
| 39283 | return true; | |||
| 39284 | } | |||
| 39285 | } | |||
| 39286 | ||||
| 39287 | return false; | |||
| 39288 | } | |||
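| | | |||
| | // Illustrative-only sketch of the mask test behind the MOVDDUP/MOVSLDUP/ | |||
| | // MOVSHDUP matches above: an element-wise comparison against an expected | |||
| | // pattern where SM_SentinelUndef entries match anything. The real | |||
| | // isTargetShuffleEquivalent is more general (it also looks through the | |||
| | // operands); this shows just the core idea. | |||
| | LLVM_ATTRIBUTE_UNUSED static bool | |||
| | masksMatchAllowingUndef(ArrayRef<int> Mask, ArrayRef<int> Expected) { | |||
| | if (Mask.size() != Expected.size()) | |||
| | return false; | |||
| | for (unsigned I = 0, E = Mask.size(); I != E; ++I) | |||
| | if (Mask[I] != SM_SentinelUndef && Mask[I] != Expected[I]) | |||
| | return false; | |||
| | return true; | |||
| | } | |||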
| 39289 | ||||
| 39290 | // Attempt to match a combined shuffle mask against supported unary immediate | |||
| 39291 | // permute instructions. | |||
| 39292 | // TODO: Investigate sharing more of this with shuffle lowering. | |||
| 39293 | static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask, | |||
| 39294 | const APInt &Zeroable, | |||
| 39295 | bool AllowFloatDomain, bool AllowIntDomain, | |||
| 39296 | const SelectionDAG &DAG, | |||
| 39297 | const X86Subtarget &Subtarget, | |||
| 39298 | unsigned &Shuffle, MVT &ShuffleVT, | |||
| 39299 | unsigned &PermuteImm) { | |||
| 39300 | unsigned NumMaskElts = Mask.size(); | |||
| 39301 | unsigned InputSizeInBits = MaskVT.getSizeInBits(); | |||
| 39302 | unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts; | |||
| 39303 | MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); | |||
| 39304 | bool ContainsZeros = isAnyZero(Mask); | |||
| 39305 | ||||
| 39306 | // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns. | |||
| 39307 | if (!ContainsZeros && MaskScalarSizeInBits == 64) { | |||
| 39308 | // Check for lane crossing permutes. | |||
| 39309 | if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { | |||
| 39310 | // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). | |||
| 39311 | if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) { | |||
| 39312 | Shuffle = X86ISD::VPERMI; | |||
| 39313 | ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64); | |||
| 39314 | PermuteImm = getV4X86ShuffleImm(Mask); | |||
| 39315 | return true; | |||
| 39316 | } | |||
| 39317 | if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) { | |||
| 39318 | SmallVector<int, 4> RepeatedMask; | |||
| 39319 | if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { | |||
| 39320 | Shuffle = X86ISD::VPERMI; | |||
| 39321 | ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64); | |||
| 39322 | PermuteImm = getV4X86ShuffleImm(RepeatedMask); | |||
| 39323 | return true; | |||
| 39324 | } | |||
| 39325 | } | |||
| 39326 | } else if (AllowFloatDomain && Subtarget.hasAVX()) { | |||
| 39327 | // VPERMILPD can permute with a non-repeating shuffle. | |||
| 39328 | Shuffle = X86ISD::VPERMILPI; | |||
| 39329 | ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); | |||
| 39330 | PermuteImm = 0; | |||
| 39331 | for (int i = 0, e = Mask.size(); i != e; ++i) { | |||
| 39332 | int M = Mask[i]; | |||
| 39333 | if (M == SM_SentinelUndef) | |||
| 39334 | continue; | |||
| 39335 | assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index"); | |||
| 39336 | PermuteImm |= (M & 1) << i; | |||
| 39337 | } | |||
| 39338 | return true; | |||
| 39339 | } | |||
| 39340 | } | |||
| 39341 | ||||
| 39342 | // We check for both a shuffle match and a shift match. Loop twice so we | |||
| 39343 | // can control which we try to match first, depending on target preference. | |||
| 39344 | for (unsigned Order = 0; Order < 2; ++Order) { | |||
| 39345 | if (Subtarget.preferLowerShuffleAsShift() ? (Order == 1) : (Order == 0)) { | |||
| 39346 | // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns. | |||
| 39347 | // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we | |||
| 39348 | // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). | |||
| 39349 | if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) && | |||
| 39350 | !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) { | |||
| 39351 | SmallVector<int, 4> RepeatedMask; | |||
| 39352 | if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { | |||
| 39353 | // Narrow the repeated mask to create 32-bit element permutes. | |||
| 39354 | SmallVector<int, 4> WordMask = RepeatedMask; | |||
| 39355 | if (MaskScalarSizeInBits == 64) | |||
| 39356 | narrowShuffleMaskElts(2, RepeatedMask, WordMask); | |||
| 39357 | ||||
| 39358 | Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI); | |||
| 39359 | ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32); | |||
| 39360 | ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); | |||
| 39361 | PermuteImm = getV4X86ShuffleImm(WordMask); | |||
| 39362 | return true; | |||
| 39363 | } | |||
| 39364 | } | |||
| 39365 | ||||
| 39366 | // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns. | |||
| 39367 | if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16 && | |||
| 39368 | ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || | |||
| 39369 | (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || | |||
| 39370 | (MaskVT.is512BitVector() && Subtarget.hasBWI()))) { | |||
| 39371 | SmallVector<int, 4> RepeatedMask; | |||
| 39372 | if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { | |||
| 39373 | ArrayRef<int> LoMask(RepeatedMask.data() + 0, 4); | |||
| 39374 | ArrayRef<int> HiMask(RepeatedMask.data() + 4, 4); | |||
| 39375 | ||||
| 39376 | // PSHUFLW: permute lower 4 elements only. | |||
| 39377 | if (isUndefOrInRange(LoMask, 0, 4) && | |||
| 39378 | isSequentialOrUndefInRange(HiMask, 0, 4, 4)) { | |||
| 39379 | Shuffle = X86ISD::PSHUFLW; | |||
| 39380 | ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); | |||
| 39381 | PermuteImm = getV4X86ShuffleImm(LoMask); | |||
| 39382 | return true; | |||
| 39383 | } | |||
| 39384 | ||||
| 39385 | // PSHUFHW: permute upper 4 elements only. | |||
| 39386 | if (isUndefOrInRange(HiMask, 4, 8) && | |||
| 39387 | isSequentialOrUndefInRange(LoMask, 0, 4, 0)) { | |||
| 39388 | // Offset the HiMask so that we can create the shuffle immediate. | |||
| 39389 | int OffsetHiMask[4]; | |||
| 39390 | for (int i = 0; i != 4; ++i) | |||
| 39391 | OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4); | |||
| 39392 | ||||
| 39393 | Shuffle = X86ISD::PSHUFHW; | |||
| 39394 | ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16); | |||
| 39395 | PermuteImm = getV4X86ShuffleImm(OffsetHiMask); | |||
| 39396 | return true; | |||
| 39397 | } | |||
| 39398 | } | |||
| 39399 | } | |||
| 39400 | } else { | |||
| 39401 | // Attempt to match against bit rotates. | |||
| 39402 | if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 && | |||
| 39403 | ((MaskVT.is128BitVector() && Subtarget.hasXOP()) || | |||
| 39404 | Subtarget.hasAVX512())) { | |||
| 39405 | int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits, | |||
| 39406 | Subtarget, Mask); | |||
| 39407 | if (0 < RotateAmt) { | |||
| 39408 | Shuffle = X86ISD::VROTLI; | |||
| 39409 | PermuteImm = (unsigned)RotateAmt; | |||
| 39410 | return true; | |||
| 39411 | } | |||
| 39412 | } | |||
| 39413 | } | |||
| 39414 | // Attempt to match against byte/bit shifts. | |||
| 39415 | if (AllowIntDomain && | |||
| 39416 | ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || | |||
| 39417 | (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || | |||
| 39418 | (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { | |||
| 39419 | int ShiftAmt = | |||
| 39420 | matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits, Mask, 0, | |||
| 39421 | Zeroable, Subtarget); | |||
| 39422 | if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() || | |||
| 39423 | 32 <= ShuffleVT.getScalarSizeInBits())) { | |||
| 39424 | // Byte shifts can be slower so only match them on second attempt. | |||
| 39425 | if (Order == 0 && | |||
| 39426 | (Shuffle == X86ISD::VSHLDQ || Shuffle == X86ISD::VSRLDQ)) | |||
| 39427 | continue; | |||
| 39428 | ||||
| 39429 | PermuteImm = (unsigned)ShiftAmt; | |||
| 39430 | return true; | |||
| 39431 | } | |||
| 39432 | ||||
| 39433 | } | |||
| 39434 | } | |||
| 39435 | ||||
| 39436 | return false; | |||
| 39437 | } | |||
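| | | |||
| | // Illustrative-only restatement of how the 4-element shuffle immediates | |||
| | // built above (via getV4X86ShuffleImm for PSHUFD/PSHUFLW/PSHUFHW) pack a | |||
| | // mask: two bits per element, element i landing in bits [2*i+1 : 2*i]; | |||
| | // undef entries may take any value, 0 here. | |||
| | LLVM_ATTRIBUTE_UNUSED static unsigned packV4ShuffleImm(ArrayRef<int> Mask) { | |||
| | assert(Mask.size() == 4 && "Expected a 4-element mask"); | |||
| | unsigned Imm = 0; | |||
| | for (unsigned I = 0; I != 4; ++I) | |||
| | Imm |= (Mask[I] < 0 ? 0 : (Mask[I] & 3)) << (I * 2); | |||
| | return Imm; | |||
| | } | |||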
| 39438 | ||||
| 39439 | // Attempt to match a combined unary shuffle mask against supported binary | |||
| 39440 | // shuffle instructions. | |||
| 39441 | // TODO: Investigate sharing more of this with shuffle lowering. | |||
| 39442 | static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask, | |||
| 39443 | bool AllowFloatDomain, bool AllowIntDomain, | |||
| 39444 | SDValue &V1, SDValue &V2, const SDLoc &DL, | |||
| 39445 | SelectionDAG &DAG, const X86Subtarget &Subtarget, | |||
| 39446 | unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, | |||
| 39447 | bool IsUnary) { | |||
| 39448 | unsigned NumMaskElts = Mask.size(); | |||
| 39449 | unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); | |||
| 39450 | unsigned SizeInBits = MaskVT.getSizeInBits(); | |||
| 39451 | ||||
| 39452 | if (MaskVT.is128BitVector()) { | |||
| 39453 | if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 0}, DAG) && | |||
| 39454 | AllowFloatDomain) { | |||
| 39455 | V2 = V1; | |||
| 39456 | V1 = (SM_SentinelUndef == Mask[0] ? DAG.getUNDEF(MVT::v4f32) : V1); | |||
| 39457 | Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKL : X86ISD::MOVLHPS; | |||
| 39458 | SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32; | |||
| 39459 | return true; | |||
| 39460 | } | |||
| 39461 | if (isTargetShuffleEquivalent(MaskVT, Mask, {1, 1}, DAG) && | |||
| 39462 | AllowFloatDomain) { | |||
| 39463 | V2 = V1; | |||
| 39464 | Shuffle = Subtarget.hasSSE2() ? X86ISD::UNPCKH : X86ISD::MOVHLPS; | |||
| 39465 | SrcVT = DstVT = Subtarget.hasSSE2() ? MVT::v2f64 : MVT::v4f32; | |||
| 39466 | return true; | |||
| 39467 | } | |||
| 39468 | if (isTargetShuffleEquivalent(MaskVT, Mask, {0, 3}, DAG) && | |||
| 39469 | Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) { | |||
| 39470 | std::swap(V1, V2); | |||
| 39471 | Shuffle = X86ISD::MOVSD; | |||
| 39472 | SrcVT = DstVT = MVT::v2f64; | |||
| 39473 | return true; | |||
| 39474 | } | |||
| 39475 | if (isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG) && | |||
| 39476 | (AllowFloatDomain || !Subtarget.hasSSE41())) { | |||
| 39477 | Shuffle = X86ISD::MOVSS; | |||
| 39478 | SrcVT = DstVT = MVT::v4f32; | |||
| 39479 | return true; | |||
| 39480 | } | |||
| 39481 | if (isTargetShuffleEquivalent(MaskVT, Mask, {8, 1, 2, 3, 4, 5, 6, 7}, | |||
| 39482 | DAG) && | |||
| 39483 | Subtarget.hasFP16()) { | |||
| 39484 | Shuffle = X86ISD::MOVSH; | |||
| 39485 | SrcVT = DstVT = MVT::v8f16; | |||
| 39486 | return true; | |||
| 39487 | } | |||
| 39488 | } | |||
| 39489 | ||||
| 39490 | // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle. | |||
| 39491 | if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) || | |||
| 39492 | ((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) || | |||
| 39493 | ((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) { | |||
| 39494 | if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG, | |||
| 39495 | Subtarget)) { | |||
| 39496 | DstVT = MaskVT; | |||
| 39497 | return true; | |||
| 39498 | } | |||
| 39499 | } | |||
| 39500 | ||||
| 39501 | // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle. | |||
| 39502 | if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) || | |||
| 39503 | (MaskVT.is128BitVector() && Subtarget.hasSSE2()) || | |||
| 39504 | (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) || | |||
| 39505 | (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || | |||
| 39506 | (MaskVT.is512BitVector() && Subtarget.hasAVX512())) { | |||
| 39507 | if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG, | |||
| 39508 | Subtarget)) { | |||
| 39509 | SrcVT = DstVT = MaskVT; | |||
| 39510 | if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) | |||
| 39511 | SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64); | |||
| 39512 | return true; | |||
| 39513 | } | |||
| 39514 | } | |||
| 39515 | ||||
| 39516 | // Attempt to match against an OR if we're performing a blend shuffle and the | |||
| 39517 | // non-blended source element is zero in each case. | |||
| 39518 | // TODO: Handle cases where the V1/V2 sizes don't match SizeInBits. | |||
| 39519 | if (SizeInBits == V1.getValueSizeInBits() && | |||
| 39520 | SizeInBits == V2.getValueSizeInBits() && | |||
| 39521 | (EltSizeInBits % V1.getScalarValueSizeInBits()) == 0 && | |||
| 39522 | (EltSizeInBits % V2.getScalarValueSizeInBits()) == 0) { | |||
| 39523 | bool IsBlend = true; | |||
| 39524 | unsigned NumV1Elts = V1.getValueType().getVectorNumElements(); | |||
| 39525 | unsigned NumV2Elts = V2.getValueType().getVectorNumElements(); | |||
| 39526 | unsigned Scale1 = NumV1Elts / NumMaskElts; | |||
| 39527 | unsigned Scale2 = NumV2Elts / NumMaskElts; | |||
| 39528 | APInt DemandedZeroV1 = APInt::getZero(NumV1Elts); | |||
| 39529 | APInt DemandedZeroV2 = APInt::getZero(NumV2Elts); | |||
| 39530 | for (unsigned i = 0; i != NumMaskElts; ++i) { | |||
| 39531 | int M = Mask[i]; | |||
| 39532 | if (M == SM_SentinelUndef) | |||
| 39533 | continue; | |||
| 39534 | if (M == SM_SentinelZero) { | |||
| 39535 | DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1); | |||
| 39536 | DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2); | |||
| 39537 | continue; | |||
| 39538 | } | |||
| 39539 | if (M == (int)i) { | |||
| 39540 | DemandedZeroV2.setBits(i * Scale2, (i + 1) * Scale2); | |||
| 39541 | continue; | |||
| 39542 | } | |||
| 39543 | if (M == (int)(i + NumMaskElts)) { | |||
| 39544 | DemandedZeroV1.setBits(i * Scale1, (i + 1) * Scale1); | |||
| 39545 | continue; | |||
| 39546 | } | |||
| 39547 | IsBlend = false; | |||
| 39548 | break; | |||
| 39549 | } | |||
| 39550 | if (IsBlend) { | |||
| 39551 | if (DAG.MaskedVectorIsZero(V1, DemandedZeroV1) && | |||
| 39552 | DAG.MaskedVectorIsZero(V2, DemandedZeroV2)) { | |||
| 39553 | Shuffle = ISD::OR; | |||
| 39554 | SrcVT = DstVT = MaskVT.changeTypeToInteger(); | |||
| 39555 | return true; | |||
| 39556 | } | |||
| 39557 | if (NumV1Elts == NumV2Elts && NumV1Elts == NumMaskElts) { | |||
| 39558 | // FIXME: handle mismatched sizes? | |||
| 39559 | // TODO: investigate if `ISD::OR` handling in | |||
| 39560 | // `TargetLowering::SimplifyDemandedVectorElts` can be improved instead. | |||
| 39561 | auto computeKnownBitsElementWise = [&DAG](SDValue V) { | |||
| 39562 | unsigned NumElts = V.getValueType().getVectorNumElements(); | |||
| 39563 | KnownBits Known(NumElts); | |||
| 39564 | for (unsigned EltIdx = 0; EltIdx != NumElts; ++EltIdx) { | |||
| 39565 | APInt Mask = APInt::getOneBitSet(NumElts, EltIdx); | |||
| 39566 | KnownBits PeepholeKnown = DAG.computeKnownBits(V, Mask); | |||
| 39567 | if (PeepholeKnown.isZero()) | |||
| 39568 | Known.Zero.setBit(EltIdx); | |||
| 39569 | if (PeepholeKnown.isAllOnes()) | |||
| 39570 | Known.One.setBit(EltIdx); | |||
| 39571 | } | |||
| 39572 | return Known; | |||
| 39573 | }; | |||
| 39574 | ||||
| 39575 | KnownBits V1Known = computeKnownBitsElementWise(V1); | |||
| 39576 | KnownBits V2Known = computeKnownBitsElementWise(V2); | |||
| 39577 | ||||
| 39578 | for (unsigned i = 0; i != NumMaskElts && IsBlend; ++i) { | |||
| 39579 | int M = Mask[i]; | |||
| 39580 | if (M == SM_SentinelUndef) | |||
| 39581 | continue; | |||
| 39582 | if (M == SM_SentinelZero) { | |||
| 39583 | IsBlend &= V1Known.Zero[i] && V2Known.Zero[i]; | |||
| 39584 | continue; | |||
| 39585 | } | |||
| 39586 | if (M == (int)i) { | |||
| 39587 | IsBlend &= V2Known.Zero[i] || V1Known.One[i]; | |||
| 39588 | continue; | |||
| 39589 | } | |||
| 39590 | if (M == (int)(i + NumMaskElts)) { | |||
| 39591 | IsBlend &= V1Known.Zero[i] || V2Known.One[i]; | |||
| 39592 | continue; | |||
| 39593 | } | |||
| 39594 | llvm_unreachable("will not get here."); | |||
| 39595 | } | |||
| 39596 | if (IsBlend) { | |||
| 39597 | Shuffle = ISD::OR; | |||
| 39598 | SrcVT = DstVT = MaskVT.changeTypeToInteger(); | |||
| 39599 | return true; | |||
| 39600 | } | |||
| 39601 | } | |||
| 39602 | } | |||
| 39603 | } | |||
| 39604 | ||||
| 39605 | return false; | |||
| 39606 | } | |||
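| | | |||
| | // Illustrative-only scalar model of the blend->OR rewrite above: a blend | |||
| | // of V1 and V2 can use (V1 | V2) when, in each lane, the unselected source | |||
| | // is zero, or the selected source is all-ones (which absorbs the OR). | |||
| | LLVM_ATTRIBUTE_UNUSED static bool | |||
| | blendLaneIsOrCompatible(bool SelectsV1, uint64_t V1Lane, uint64_t V2Lane) { | |||
| | uint64_t Selected = SelectsV1 ? V1Lane : V2Lane; | |||
| | uint64_t Other = SelectsV1 ? V2Lane : V1Lane; | |||
| | return Other == 0 || Selected == ~uint64_t(0); | |||
| | } | |||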
| 39607 | ||||
| 39608 | static bool matchBinaryPermuteShuffle( | |||
| 39609 | MVT MaskVT, ArrayRef<int> Mask, const APInt &Zeroable, | |||
| 39610 | bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, | |||
| 39611 | const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, | |||
| 39612 | unsigned &Shuffle, MVT &ShuffleVT, unsigned &PermuteImm) { | |||
| 39613 | unsigned NumMaskElts = Mask.size(); | |||
| 39614 | unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); | |||
| 39615 | ||||
| 39616 | // Attempt to match against VALIGND/VALIGNQ rotate. | |||
| 39617 | if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) && | |||
| 39618 | ((MaskVT.is128BitVector() && Subtarget.hasVLX()) || | |||
| 39619 | (MaskVT.is256BitVector() && Subtarget.hasVLX()) || | |||
| 39620 | (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { | |||
| 39621 | if (!isAnyZero(Mask)) { | |||
| 39622 | int Rotation = matchShuffleAsElementRotate(V1, V2, Mask); | |||
| 39623 | if (0 < Rotation) { | |||
| 39624 | Shuffle = X86ISD::VALIGN; | |||
| 39625 | if (EltSizeInBits == 64) | |||
| 39626 | ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64); | |||
| 39627 | else | |||
| 39628 | ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32); | |||
| 39629 | PermuteImm = Rotation; | |||
| 39630 | return true; | |||
| 39631 | } | |||
| 39632 | } | |||
| 39633 | } | |||
| 39634 | ||||
| 39635 | // Attempt to match against PALIGNR byte rotate. | |||
| 39636 | if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) || | |||
| 39637 | (MaskVT.is256BitVector() && Subtarget.hasAVX2()) || | |||
| 39638 | (MaskVT.is512BitVector() && Subtarget.hasBWI()))) { | |||
| 39639 | int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask); | |||
| 39640 | if (0 < ByteRotation) { | |||
| 39641 | Shuffle = X86ISD::PALIGNR; | |||
| 39642 | ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8); | |||
| 39643 | PermuteImm = ByteRotation; | |||
| 39644 | return true; | |||
| 39645 | } | |||
| 39646 | } | |||
| 39647 | ||||
| 39648 | // Attempt to combine to X86ISD::BLENDI. | |||
| 39649 | if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) || | |||
| 39650 | (Subtarget.hasAVX() && MaskVT.is256BitVector()))) || | |||
| 39651 | (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) { | |||
| 39652 | uint64_t BlendMask = 0; | |||
| 39653 | bool ForceV1Zero = false, ForceV2Zero = false; | |||
| 39654 | SmallVector<int, 8> TargetMask(Mask); | |||
| 39655 | if (matchShuffleAsBlend(MaskVT, V1, V2, TargetMask, Zeroable, ForceV1Zero, | |||
| 39656 | ForceV2Zero, BlendMask)) { | |||
| 39657 | if (MaskVT == MVT::v16i16) { | |||
| 39658 | // We can only use v16i16 PBLENDW if the lanes are repeated. | |||
| 39659 | SmallVector<int, 8> RepeatedMask; | |||
| 39660 | if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask, | |||
| 39661 | RepeatedMask)) { | |||
| 39662 | assert(RepeatedMask.size() == 8 && | |||
| 39663 | "Repeated mask size doesn't match!"); | |||
| 39664 | PermuteImm = 0; | |||
| 39665 | for (int i = 0; i < 8; ++i) | |||
| 39666 | if (RepeatedMask[i] >= 8) | |||
| 39667 | PermuteImm |= 1 << i; | |||
| 39668 | V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; | |||
| 39669 | V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; | |||
| 39670 | Shuffle = X86ISD::BLENDI; | |||
| 39671 | ShuffleVT = MaskVT; | |||
| 39672 | return true; | |||
| 39673 | } | |||
| 39674 | } else { | |||
| 39675 | V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; | |||
| 39676 | V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; | |||
| 39677 | PermuteImm = (unsigned)BlendMask; | |||
| 39678 | Shuffle = X86ISD::BLENDI; | |||
| 39679 | ShuffleVT = MaskVT; | |||
| 39680 | return true; | |||
| 39681 | } | |||
| 39682 | } | |||
| 39683 | } | |||
| 39684 | ||||
| 39685 | // Attempt to combine to INSERTPS, but only if it has elements that need to | |||
| 39686 | // be set to zero. | |||
| 39687 | if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && | |||
| 39688 | MaskVT.is128BitVector() && isAnyZero(Mask) && | |||
| 39689 | matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { | |||
| 39690 | Shuffle = X86ISD::INSERTPS; | |||
| 39691 | ShuffleVT = MVT::v4f32; | |||
| 39692 | return true; | |||
| 39693 | } | |||
| 39694 | ||||
| 39695 | // Attempt to combine to SHUFPD. | |||
| 39696 | if (AllowFloatDomain && EltSizeInBits == 64 && | |||
| 39697 | ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || | |||
| 39698 | (MaskVT.is256BitVector() && Subtarget.hasAVX()) || | |||
| 39699 | (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { | |||
| 39700 | bool ForceV1Zero = false, ForceV2Zero = false; | |||
| 39701 | if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero, | |||
| 39702 | PermuteImm, Mask, Zeroable)) { | |||
| 39703 | V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; | |||
| 39704 | V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; | |||
| 39705 | Shuffle = X86ISD::SHUFP; | |||
| 39706 | ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64); | |||
| 39707 | return true; | |||
| 39708 | } | |||
| 39709 | } | |||
| 39710 | ||||
| 39711 | // Attempt to combine to SHUFPS. | |||
| 39712 | if (AllowFloatDomain && EltSizeInBits == 32 && | |||
| 39713 | ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) || | |||
| 39714 | (MaskVT.is256BitVector() && Subtarget.hasAVX()) || | |||
| 39715 | (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { | |||
| 39716 | SmallVector<int, 4> RepeatedMask; | |||
| 39717 | if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) { | |||
| 39718 | // Match each half of the repeated mask to determine if it's just | |||
| 39719 | // referencing one of the vectors, is zeroable, or is entirely undef. | |||
| 39720 | auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) { | |||
| 39721 | int M0 = RepeatedMask[Offset]; | |||
| 39722 | int M1 = RepeatedMask[Offset + 1]; | |||
| 39723 | ||||
| 39724 | if (isUndefInRange(RepeatedMask, Offset, 2)) { | |||
| 39725 | return DAG.getUNDEF(MaskVT); | |||
| 39726 | } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) { | |||
| 39727 | S0 = (SM_SentinelUndef == M0 ? -1 : 0); | |||
| 39728 | S1 = (SM_SentinelUndef == M1 ? -1 : 1); | |||
| 39729 | return getZeroVector(MaskVT, Subtarget, DAG, DL); | |||
| 39730 | } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) { | |||
| 39731 | S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3); | |||
| 39732 | S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3); | |||
| 39733 | return V1; | |||
| 39734 | } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) { | |||
| 39735 | S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3); | |||
| 39736 | S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3); | |||
| 39737 | return V2; | |||
| 39738 | } | |||
| 39739 | ||||
| 39740 | return SDValue(); | |||
| 39741 | }; | |||
| 39742 | ||||
| 39743 | int ShufMask[4] = {-1, -1, -1, -1}; | |||
| 39744 | SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]); | |||
| 39745 | SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]); | |||
| 39746 | ||||
| 39747 | if (Lo && Hi) { | |||
| 39748 | V1 = Lo; | |||
| 39749 | V2 = Hi; | |||
| 39750 | Shuffle = X86ISD::SHUFP; | |||
| 39751 | ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32); | |||
| 39752 | PermuteImm = getV4X86ShuffleImm(ShufMask); | |||
| 39753 | return true; | |||
| 39754 | } | |||
| 39755 | } | |||
| 39756 | } | |||
| 39757 | ||||
| 39758 | // Attempt to combine to INSERTPS more generally if X86ISD::SHUFP failed. | |||
| 39759 | if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && | |||
| 39760 | MaskVT.is128BitVector() && | |||
| 39761 | matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { | |||
| 39762 | Shuffle = X86ISD::INSERTPS; | |||
| 39763 | ShuffleVT = MVT::v4f32; | |||
| 39764 | return true; | |||
| 39765 | } | |||
| 39766 | ||||
| 39767 | return false; | |||
| 39768 | } | |||
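| | | |||
| | // Illustrative-only restatement of the PBLENDW immediate construction in | |||
| | // the BLENDI match above: one bit per 16-bit lane of the 128-bit repeated | |||
| | // mask, set when the lane is taken from the second source. | |||
| | LLVM_ATTRIBUTE_UNUSED static unsigned | |||
| | packPBlendWImm(ArrayRef<int> RepeatedMask) { | |||
| | assert(RepeatedMask.size() == 8 && "PBLENDW immediate covers 8 lanes"); | |||
| | unsigned Imm = 0; | |||
| | for (int I = 0; I != 8; ++I) | |||
| | if (RepeatedMask[I] >= 8) | |||
| | Imm |= 1u << I; | |||
| | return Imm; | |||
| | } | |||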
| 39769 | ||||
| 39770 | static SDValue combineX86ShuffleChainWithExtract( | |||
| 39771 | ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth, | |||
| 39772 | bool HasVariableMask, bool AllowVariableCrossLaneMask, | |||
| 39773 | bool AllowVariablePerLaneMask, SelectionDAG &DAG, | |||
| 39774 | const X86Subtarget &Subtarget); | |||
| 39775 | ||||
| 39776 | /// Combine an arbitrary chain of shuffles into a single instruction if | |||
| 39777 | /// possible. | |||
| 39778 | /// | |||
| 39779 | /// This is the leaf of the recursive combine below. When we have found some | |||
| 39780 | /// chain of single-use x86 shuffle instructions and accumulated the combined | |||
| 39781 | /// shuffle mask represented by them, this will try to pattern match that mask | |||
| 39782 | /// into either a single instruction if there is a special purpose instruction | |||
| 39783 | /// for this operation, or into a PSHUFB instruction which is a fully general | |||
| 39784 | /// instruction but should only be used to replace chains over a certain depth. | |||
| 39785 | static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, | |||
| 39786 | ArrayRef<int> BaseMask, int Depth, | |||
| 39787 | bool HasVariableMask, | |||
| 39788 | bool AllowVariableCrossLaneMask, | |||
| 39789 | bool AllowVariablePerLaneMask, | |||
| 39790 | SelectionDAG &DAG, | |||
| 39791 | const X86Subtarget &Subtarget) { | |||
| 39792 | assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!"); | |||
| 39793 | assert((Inputs.size() == 1 || Inputs.size() == 2) && | |||
| 39794 | "Unexpected number of shuffle inputs!"); | |||
| 39795 | ||||
| 39796 | SDLoc DL(Root); | |||
| 39797 | MVT RootVT = Root.getSimpleValueType(); | |||
| 39798 | unsigned RootSizeInBits = RootVT.getSizeInBits(); | |||
| 39799 | unsigned NumRootElts = RootVT.getVectorNumElements(); | |||
| 39800 | ||||
| 39801 | // Canonicalize shuffle input op to the requested type. | |||
| 39802 | auto CanonicalizeShuffleInput = [&](MVT VT, SDValue Op) { | |||
| 39803 | if (VT.getSizeInBits() > Op.getValueSizeInBits()) | |||
| 39804 | Op = widenSubVector(Op, false, Subtarget, DAG, DL, VT.getSizeInBits()); | |||
| 39805 | else if (VT.getSizeInBits() < Op.getValueSizeInBits()) | |||
| 39806 | Op = extractSubVector(Op, 0, DAG, DL, VT.getSizeInBits()); | |||
| 39807 | return DAG.getBitcast(VT, Op); | |||
| 39808 | }; | |||
| 39809 | ||||
| 39810 | // Find the inputs that enter the chain. Note that multiple uses are OK | |||
| 39811 | // here, we're not going to remove the operands we find. | |||
| 39812 | bool UnaryShuffle = (Inputs.size() == 1); | |||
| 39813 | SDValue V1 = peekThroughBitcasts(Inputs[0]); | |||
| 39814 | SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType()) | |||
| 39815 | : peekThroughBitcasts(Inputs[1])); | |||
| 39816 | ||||
| 39817 | MVT VT1 = V1.getSimpleValueType(); | |||
| 39818 | MVT VT2 = V2.getSimpleValueType(); | |||
| 39819 | assert((RootSizeInBits % VT1.getSizeInBits()) == 0 && | |||
| 39820 | (RootSizeInBits % VT2.getSizeInBits()) == 0 && "Vector size mismatch"); | |||
| 39821 | ||||
| 39822 | SDValue Res; | |||
| 39823 | ||||
| 39824 | unsigned NumBaseMaskElts = BaseMask.size(); | |||
| 39825 | if (NumBaseMaskElts == 1) { | |||
| 39826 | assert(BaseMask[0] == 0 && "Invalid shuffle index found!"); | |||
| 39827 | return CanonicalizeShuffleInput(RootVT, V1); | |||
| 39828 | } | |||
| 39829 | ||||
| 39830 | bool OptForSize = DAG.shouldOptForSize(); | |||
| 39831 | unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts; | |||
| 39832 | bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() || | |||
| 39833 | (RootVT.isFloatingPoint() && Depth >= 1) || | |||
| 39834 | (RootVT.is256BitVector() && !Subtarget.hasAVX2()); | |||
| 39835 | ||||
| 39836 | // Don't combine if we are an AVX512/EVEX target and the mask element size | |||
| 39837 | // is different from the root element size - this would prevent writemasks | |||
| 39838 | // from being reused. | |||
| 39839 | bool IsMaskedShuffle = false; | |||
| 39840 | if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) { | |||
| 39841 | if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT && | |||
| 39842 | Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) { | |||
| 39843 | IsMaskedShuffle = true; | |||
| 39844 | } | |||
| 39845 | } | |||
| 39846 | ||||
| 39847 | // If we are shuffling a splat (and not introducing zeros) then we can just | |||
| 39848 | // use it directly. This works for smaller elements as well as they already | |||
| 39849 | // repeat across each mask element. | |||
| 39850 | if (UnaryShuffle && !isAnyZero(BaseMask) && | |||
| 39851 | V1.getValueSizeInBits() >= RootSizeInBits && | |||
| 39852 | (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0 && | |||
| 39853 | DAG.isSplatValue(V1, /*AllowUndefs*/ false)) { | |||
| 39854 | return CanonicalizeShuffleInput(RootVT, V1); | |||
| 39855 | } | |||
| 39856 | ||||
| 39857 | SmallVector<int, 64> Mask(BaseMask); | |||
| 39858 | ||||
| 39859 | // See if the shuffle is a hidden identity shuffle - repeated args in HOPs | |||
| 39860 | // etc. can be simplified. | |||
| 39861 | if (VT1 == VT2 && VT1.getSizeInBits() == RootSizeInBits && VT1.isVector()) { | |||
| 39862 | SmallVector<int> ScaledMask, IdentityMask; | |||
| 39863 | unsigned NumElts = VT1.getVectorNumElements(); | |||
| 39864 | if (Mask.size() <= NumElts && | |||
| 39865 | scaleShuffleElements(Mask, NumElts, ScaledMask)) { | |||
| 39866 | for (unsigned i = 0; i != NumElts; ++i) | |||
| 39867 | IdentityMask.push_back(i); | |||
| 39868 | if (isTargetShuffleEquivalent(RootVT, ScaledMask, IdentityMask, DAG, V1, | |||
| 39869 | V2)) | |||
| 39870 | return CanonicalizeShuffleInput(RootVT, V1); | |||
| 39871 | } | |||
| 39872 | } | |||
| 39873 | ||||
| 39874 | // Handle 128/256-bit lane shuffles of 512-bit vectors. | |||
| 39875 | if (RootVT.is512BitVector() && | |||
| 39876 | (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) { | |||
| 39877 | // If the upper subvectors are zeroable, then an extract+insert is cheaper | |||
| 39878 | // than using X86ISD::SHUF128. The insertion is free, even if it has | |||
| 39879 | // to zero the upper subvectors. | |||
| 39880 | if (isUndefOrZeroInRange(Mask, 1, NumBaseMaskElts - 1)) { | |||
| 39881 | if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR) | |||
| 39882 | return SDValue(); // Nothing to do! | |||
| 39883 | assert(isInRange(Mask[0], 0, NumBaseMaskElts) && | |||
| 39884 | "Unexpected lane shuffle"); | |||
| 39885 | Res = CanonicalizeShuffleInput(RootVT, V1); | |||
| 39886 | unsigned SubIdx = Mask[0] * (NumRootElts / NumBaseMaskElts); | |||
| 39887 | bool UseZero = isAnyZero(Mask); | |||
| 39888 | Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits); | |||
| 39889 | return widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits); | |||
| 39890 | } | |||
| 39891 | ||||
| 39892 | // Narrow shuffle mask to v4x128. | |||
| 39893 | SmallVector<int, 4> ScaledMask; | |||
| 39894 | assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size"); | |||
| 39895 | narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, Mask, ScaledMask); | |||
| 39896 | ||||
| 39897 | // Try to lower to vshuf64x2/vshuf32x4. | |||
| 39898 | auto MatchSHUF128 = [&](MVT ShuffleVT, const SDLoc &DL, | |||
| 39899 | ArrayRef<int> ScaledMask, SDValue V1, SDValue V2, | |||
| 39900 | SelectionDAG &DAG) { | |||
| 39901 | unsigned PermMask = 0; | |||
| 39902 | // Ensure elements came from the same Op. | |||
| 39903 | SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)}; | |||
| 39904 | for (int i = 0; i < 4; ++i) { | |||
| 39905 | assert(ScaledMask[i] >= -1 && "Illegal shuffle sentinel value"); | |||
| 39906 | if (ScaledMask[i] < 0) | |||
| 39907 | continue; | |||
| 39908 | ||||
| 39909 | SDValue Op = ScaledMask[i] >= 4 ? V2 : V1; | |||
| 39910 | unsigned OpIndex = i / 2; | |||
| 39911 | if (Ops[OpIndex].isUndef()) | |||
| 39912 | Ops[OpIndex] = Op; | |||
| 39913 | else if (Ops[OpIndex] != Op) | |||
| 39914 | return SDValue(); | |||
| 39915 | ||||
| 39916 | // Convert the 128-bit shuffle mask selection values into 128-bit | |||
| 39917 | // selection bits defined by a vshuf64x2 instruction's immediate control | |||
| 39918 | // byte. | |||
| 39919 | PermMask |= (ScaledMask[i] % 4) << (i * 2); | |||
| 39920 | } | |||
| 39921 | ||||
| 39922 | return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT, | |||
| 39923 | CanonicalizeShuffleInput(ShuffleVT, Ops[0]), | |||
| 39924 | CanonicalizeShuffleInput(ShuffleVT, Ops[1]), | |||
| 39925 | DAG.getTargetConstant(PermMask, DL, MVT::i8)); | |||
| 39926 | }; | |||
| 39927 | ||||
| 39928 | // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask | |||
| 39929 | // doesn't work because our mask is for 128 bits and we don't have an MVT | |||
| 39930 | // to match that. | |||
| 39931 | bool PreferPERMQ = UnaryShuffle && isUndefOrInRange(ScaledMask[0], 0, 2) && | |||
| 39932 | isUndefOrInRange(ScaledMask[1], 0, 2) && | |||
| 39933 | isUndefOrInRange(ScaledMask[2], 2, 4) && | |||
| 39934 | isUndefOrInRange(ScaledMask[3], 2, 4) && | |||
| 39935 | (ScaledMask[0] < 0 || ScaledMask[2] < 0 || | |||
| 39936 | ScaledMask[0] == (ScaledMask[2] % 2)) && | |||
| 39937 | (ScaledMask[1] < 0 || ScaledMask[3] < 0 || | |||
| 39938 | ScaledMask[1] == (ScaledMask[3] % 2)); | |||
| 39939 | ||||
| 39940 | if (!isAnyZero(ScaledMask) && !PreferPERMQ) { | |||
| 39941 | if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128) | |||
| 39942 | return SDValue(); // Nothing to do! | |||
| 39943 | MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64); | |||
| 39944 | if (SDValue V = MatchSHUF128(ShuffleVT, DL, ScaledMask, V1, V2, DAG)) | |||
| 39945 | return DAG.getBitcast(RootVT, V); | |||
| 39946 | } | |||
| 39947 | } | |||
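| | | |||
| | // Worked example (comments only) of the SHUF128 immediate MatchSHUF128 | |||
| | // builds: for an assumed 128-bit-lane mask {0, 1, 6, 7}, lanes 0,1 come | |||
| | // from V1 and lanes 6,7 (lanes 2,3 of V2) from V2, so Ops = {V1, V2} and | |||
| | // PermMask = (0<<0) | (1<<2) | (2<<4) | (3<<6) = 0xE4. | |||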
| 39948 | ||||
| 39949 | // Handle 128-bit lane shuffles of 256-bit vectors. | |||
| 39950 | if (RootVT.is256BitVector() && NumBaseMaskElts == 2) { | |||
| 39951 | // If the upper half is zeroable, then an extract+insert is cheaper | |||
| 39952 | // than using X86ISD::VPERM2X128. The insertion is free, even if it has to | |||
| 39953 | // zero the upper half. | |||
| 39954 | if (isUndefOrZero(Mask[1])) { | |||
| 39955 | if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR) | |||
| 39956 | return SDValue(); // Nothing to do! | |||
| 39957 | assert(isInRange(Mask[0], 0, 2) && "Unexpected lane shuffle"); | |||
| 39958 | Res = CanonicalizeShuffleInput(RootVT, V1); | |||
| 39959 | Res = extract128BitVector(Res, Mask[0] * (NumRootElts / 2), DAG, DL); | |||
| 39960 | return widenSubVector(Res, Mask[1] == SM_SentinelZero, Subtarget, DAG, DL, | |||
| 39961 | 256); | |||
| 39962 | } | |||
| 39963 | ||||
| 39964 | // If we're inserting the low subvector, an insert-subvector 'concat' | |||
| 39965 | // pattern is quicker than VPERM2X128. | |||
| 39966 | // TODO: Add AVX2 support instead of VPERMQ/VPERMPD. | |||
| 39967 | if (BaseMask[0] == 0 && (BaseMask[1] == 0 || BaseMask[1] == 2) && | |||
| 39968 | !Subtarget.hasAVX2()) { | |||
| 39969 | if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR) | |||
| 39970 | return SDValue(); // Nothing to do! | |||
| 39971 | SDValue Lo = CanonicalizeShuffleInput(RootVT, V1); | |||
| 39972 | SDValue Hi = CanonicalizeShuffleInput(RootVT, BaseMask[1] == 0 ? V1 : V2); | |||
| 39973 | Hi = extractSubVector(Hi, 0, DAG, DL, 128); | |||
| 39974 | return insertSubVector(Lo, Hi, NumRootElts / 2, DAG, DL, 128); | |||
| 39975 | } | |||
| 39976 | ||||
| 39977 | if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128) | |||
| 39978 | return SDValue(); // Nothing to do! | |||
| 39979 | ||||
| 39980 | // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless | |||
| 39981 | // we need to use the zeroing feature. | |||
| 39982 | // Prefer blends for sequential shuffles unless we are optimizing for size. | |||
| 39983 | if (UnaryShuffle && | |||
| 39984 | !(Subtarget.hasAVX2() && isUndefOrInRange(Mask, 0, 2)) && | |||
| 39985 | (OptForSize || !isSequentialOrUndefOrZeroInRange(Mask, 0, 2, 0))) { | |||
| 39986 | unsigned PermMask = 0; | |||
| 39987 | PermMask |= ((Mask[0] < 0 ? 0x8 : (Mask[0] & 1)) << 0); | |||
| 39988 | PermMask |= ((Mask[1] < 0 ? 0x8 : (Mask[1] & 1)) << 4); | |||
| 39989 | return DAG.getNode( | |||
| 39990 | X86ISD::VPERM2X128, DL, RootVT, CanonicalizeShuffleInput(RootVT, V1), | |||
| 39991 | DAG.getUNDEF(RootVT), DAG.getTargetConstant(PermMask, DL, MVT::i8)); | |||
| 39992 | } | |||
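| | // Illustrative sketch (mask values assumed): Mask = {1, 0} yields | |||
| | // PermMask = 0x01, swapping the two 128-bit halves of V1, while a zero or | |||
| | // undef sentinel in Mask[1] yields PermMask = 0x81, where the 0x8 nibble | |||
| | // bit asks VPERM2X128 to zero the upper half. | |||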
| 39993 | ||||
| 39994 | if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128) | |||
| 39995 | return SDValue(); // Nothing to do! | |||
| 39996 | ||||
| 39997 | // TODO - handle AVX512VL cases with X86ISD::SHUF128. | |||
| 39998 | if (!UnaryShuffle && !IsMaskedShuffle) { | |||
| 39999 | assert(llvm::all_of(Mask, [](int M) { return 0 <= M && M < 4; }) && | |||
| 40000 | "Unexpected shuffle sentinel value"); | |||
| 40001 | // Prefer blends to X86ISD::VPERM2X128. | |||
| 40002 | if (!((Mask[0] == 0 && Mask[1] == 3) || (Mask[0] == 2 && Mask[1] == 1))) { | |||
| 40003 | unsigned PermMask = 0; | |||
| 40004 | PermMask |= ((Mask[0] & 3) << 0); | |||
| 40005 | PermMask |= ((Mask[1] & 3) << 4); | |||
| 40006 | SDValue LHS = isInRange(Mask[0], 0, 2) ? V1 : V2; | |||
| 40007 | SDValue RHS = isInRange(Mask[1], 0, 2) ? V1 : V2; | |||
| 40008 | return DAG.getNode(X86ISD::VPERM2X128, DL, RootVT, | |||
| 40009 | CanonicalizeShuffleInput(RootVT, LHS), | |||
| 40010 | CanonicalizeShuffleInput(RootVT, RHS), | |||
| 40011 | DAG.getTargetConstant(PermMask, DL, MVT::i8)); | |||
| 40012 | } | |||
| 40013 | } | |||
| 40014 | } | |||
| 40015 | ||||
| 40016 | // For masks that have been widened to 128-bit elements or more, | |||
| 40017 | // narrow back down to 64-bit elements. | |||
| 40018 | if (BaseMaskEltSizeInBits > 64) { | |||
| 40019 | assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size"); | |||
| 40020 | int MaskScale = BaseMaskEltSizeInBits / 64; | |||
| 40021 | SmallVector<int, 64> ScaledMask; | |||
| 40022 | narrowShuffleMaskElts(MaskScale, Mask, ScaledMask); | |||
| 40023 | Mask = std::move(ScaledMask); | |||
| 40024 | } | |||
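| | // Illustrative sketch (assumed 256-bit root with a 2 x 128-bit mask): | |||
| | // MaskScale = 2 rewrites Mask = {1, 0} as {2, 3, 0, 1}; each wide index M | |||
| | // expands to the 64-bit elements {M*2, M*2+1}, and sentinels are simply | |||
| | // replicated. | |||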
| 40025 | ||||
| 40026 | // For masked shuffles, we're trying to match the root width for better | |||
| 40027 | // writemask folding; attempt to scale the mask. | |||
| 40028 | // TODO - variable shuffles might need this to be widened again. | |||
| 40029 | if (IsMaskedShuffle && NumRootElts > Mask.size()) { | |||
| 40030 | assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size"); | |||
| 40031 | int MaskScale = NumRootElts / Mask.size(); | |||
| 40032 | SmallVector<int, 64> ScaledMask; | |||
| 40033 | narrowShuffleMaskElts(MaskScale, Mask, ScaledMask); | |||
| 40034 | Mask = std::move(ScaledMask); | |||
| 40035 | } | |||
| 40036 | ||||
| 40037 | unsigned NumMaskElts = Mask.size(); | |||
| 40038 | unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts; | |||
| 40039 | ||||
| 40040 | // Determine the effective mask value type. | |||
| 40041 | FloatDomain &= (32 <= MaskEltSizeInBits); | |||
| 40042 | MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits) | |||
| 40043 | : MVT::getIntegerVT(MaskEltSizeInBits); | |||
| 40044 | MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts); | |||
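| | // Illustrative sketch (assumed values): a 256-bit root with an 8-element | |||
| | // mask gives MaskEltSizeInBits = 32, so MaskVT is v8f32 in the float | |||
| | // domain and v8i32 otherwise; sub-32-bit elements always stay in the | |||
| | // integer domain because of the clamp above. | |||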
| 40045 | ||||
| 40046 | // Only allow legal mask types. | |||
| 40047 | if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) | |||
| 40048 | return SDValue(); | |||
| 40049 | ||||
| 40050 | // Attempt to match the mask against known shuffle patterns. | |||
| 40051 | MVT ShuffleSrcVT, ShuffleVT; | |||
| 40052 | unsigned Shuffle, PermuteImm; | |||
| 40053 | ||||
| 40054 | // Which shuffle domains are permitted? | |||
| 40055 | // Permit domain crossing at higher combine depths. | |||
| 40056 | // TODO: Should we indicate which domain is preferred if both are allowed? | |||
| 40057 | bool AllowFloatDomain = FloatDomain || (Depth >= 3); | |||
| 40058 | bool AllowIntDomain = (!FloatDomain || (Depth >= 3)) && Subtarget.hasSSE2() && | |||
| 40059 | (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); | |||
| 40060 | ||||
| 40061 | // Determine zeroable mask elements. | |||
| 40062 | APInt KnownUndef, KnownZero; | |||
| 40063 | resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero); | |||
| 40064 | APInt Zeroable = KnownUndef | KnownZero; | |||
| 40065 | ||||
| 40066 | if (UnaryShuffle) { | |||
| 40067 | // Attempt to match against broadcast-from-vector. | |||
| 40068 | // Limit AVX1 to cases where we're loading+broadcasting a scalar element. | |||
| 40069 | if ((Subtarget.hasAVX2() || | |||
| 40070 | (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) && | |||
| 40071 | (!IsMaskedShuffle || NumRootElts == NumMaskElts)) { | |||
| 40072 | if (isUndefOrEqual(Mask, 0)) { | |||
| 40073 | if (V1.getValueType() == MaskVT && | |||
| 40074 | V1.getOpcode() == ISD::SCALAR_TO_VECTOR && | |||
| 40075 | X86::mayFoldLoad(V1.getOperand(0), Subtarget)) { | |||
| 40076 | if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST) | |||
| 40077 | return SDValue(); // Nothing to do! | |||
| 40078 | Res = V1.getOperand(0); | |||
| 40079 | Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); | |||
| 40080 | return DAG.getBitcast(RootVT, Res); | |||
| 40081 | } | |||
| 40082 | if (Subtarget.hasAVX2()) { | |||
| 40083 | if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST) | |||
| 40084 | return SDValue(); // Nothing to do! | |||
| 40085 | Res = CanonicalizeShuffleInput(MaskVT, V1); | |||
| 40086 | Res = DAG.getNode(X86ISD::VBROADCAST, DL, MaskVT, Res); | |||
| 40087 | return DAG.getBitcast(RootVT, Res); | |||
| 40088 | } | |||
| 40089 | } | |||
| 40090 | } | |||
| 40091 | ||||
| 40092 | if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, V1, | |||
| 40093 | DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT) && | |||
| 40094 | (!IsMaskedShuffle || | |||
| 40095 | (NumRootElts == ShuffleVT.getVectorNumElements()))) { | |||
| 40096 | if (Depth == 0 && Root.getOpcode() == Shuffle) | |||
| 40097 | return SDValue(); // Nothing to do! | |||
| 40098 | Res = CanonicalizeShuffleInput(ShuffleSrcVT, V1); | |||
| 40099 | Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); | |||
| 40100 | return DAG.getBitcast(RootVT, Res); | |||
| 40101 | } | |||
| 40102 | ||||
| 40103 | if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, | |||
| 40104 | AllowIntDomain, DAG, Subtarget, Shuffle, ShuffleVT, | |||
| 40105 | PermuteImm) && | |||
| 40106 | (!IsMaskedShuffle || | |||
| 40107 | (NumRootElts == ShuffleVT.getVectorNumElements()))) { | |||
| 40108 | if (Depth == 0 && Root.getOpcode() == Shuffle) | |||
| 40109 | return SDValue(); // Nothing to do! | |||
| 40110 | Res = CanonicalizeShuffleInput(ShuffleVT, V1); | |||
| 40111 | Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, | |||
| 40112 | DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); | |||
| 40113 | return DAG.getBitcast(RootVT, Res); | |||
| 40114 | } | |||
| 40115 | } | |||
| 40116 | ||||
| 40117 | // Attempt to combine to INSERTPS, but only if the inserted element has come | |||
| 40118 | // from a scalar. | |||
| 40119 | // TODO: Handle other insertions here as well? | |||
| 40120 | if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 && | |||
| 40121 | Subtarget.hasSSE41() && | |||
| 40122 | !isTargetShuffleEquivalent(MaskVT, Mask, {4, 1, 2, 3}, DAG)) { | |||
| 40123 | if (MaskEltSizeInBits == 32) { | |||
| 40124 | SDValue SrcV1 = V1, SrcV2 = V2; | |||
| 40125 | if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, | |||
| 40126 | DAG) && | |||
| 40127 | SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) { | |||
| 40128 | if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS) | |||
| 40129 | return SDValue(); // Nothing to do! | |||
| 40130 | Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, | |||
| 40131 | CanonicalizeShuffleInput(MVT::v4f32, SrcV1), | |||
| 40132 | CanonicalizeShuffleInput(MVT::v4f32, SrcV2), | |||
| 40133 | DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); | |||
| 40134 | return DAG.getBitcast(RootVT, Res); | |||
| 40135 | } | |||
| 40136 | } | |||
| 40137 | if (MaskEltSizeInBits == 64 && | |||
| 40138 | isTargetShuffleEquivalent(MaskVT, Mask, {0, 2}, DAG) && | |||
| 40139 | V2.getOpcode() == ISD::SCALAR_TO_VECTOR && | |||
| 40140 | V2.getScalarValueSizeInBits() <= 32) { | |||
| 40141 | if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS) | |||
| 40142 | return SDValue(); // Nothing to do! | |||
| 40143 | PermuteImm = (/*DstIdx*/ 2 << 4) | (/*SrcIdx*/ 0 << 0); | |||
| 40144 | Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, | |||
| 40145 | CanonicalizeShuffleInput(MVT::v4f32, V1), | |||
| 40146 | CanonicalizeShuffleInput(MVT::v4f32, V2), | |||
| 40147 | DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); | |||
| 40148 | return DAG.getBitcast(RootVT, Res); | |||
| 40149 | } | |||
| 40150 | } | |||
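| | // Illustrative note on the immediate (encoding per the SSE4.1 INSERTPS | |||
| | // definition): bits [7:6] pick the source element, bits [5:4] the | |||
| | // destination element, bits [3:0] a zero mask, so (2 << 4) | 0 = 0x20 | |||
| | // inserts element 0 of V2 into element 2 of V1 with nothing zeroed. | |||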
| 40151 | ||||
| 40152 | SDValue NewV1 = V1; // Save operands in case early exit happens. | |||
| 40153 | SDValue NewV2 = V2; | |||
| 40154 | if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1, | |||
| 40155 | NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, | |||
| 40156 | ShuffleVT, UnaryShuffle) && | |||
| 40157 | (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { | |||
| 40158 | if (Depth == 0 && Root.getOpcode() == Shuffle) | |||
| 40159 | return SDValue(); // Nothing to do! | |||
| 40160 | NewV1 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV1); | |||
| 40161 | NewV2 = CanonicalizeShuffleInput(ShuffleSrcVT, NewV2); | |||
| 40162 | Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2); | |||
| 40163 | return DAG.getBitcast(RootVT, Res); | |||
| 40164 | } | |||
| 40165 | ||||
| 40166 | NewV1 = V1; // Save operands in case early exit happens. | |||
| 40167 | NewV2 = V2; | |||
| 40168 | if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, | |||
| 40169 | AllowIntDomain, NewV1, NewV2, DL, DAG, | |||
| 40170 | Subtarget, Shuffle, ShuffleVT, PermuteImm) && | |||
| 40171 | (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) { | |||
| 40172 | if (Depth == 0 && Root.getOpcode() == Shuffle) | |||
| 40173 | return SDValue(); // Nothing to do! | |||
| 40174 | NewV1 = CanonicalizeShuffleInput(ShuffleVT, NewV1); | |||
| 40175 | NewV2 = CanonicalizeShuffleInput(ShuffleVT, NewV2); | |||
| 40176 | Res = DAG.getNode(Shuffle, DL, ShuffleVT, NewV1, NewV2, | |||
| 40177 | DAG.getTargetConstant(PermuteImm, DL, MVT::i8)); | |||
| 40178 | return DAG.getBitcast(RootVT, Res); | |||
| 40179 | } | |||
| 40180 | ||||
| 40181 | // Typically from here on, we need an integer version of MaskVT. | |||
| 40182 | MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits); | |||
| 40183 | IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts); | |||
| 40184 | ||||
| 40185 | // Annoyingly, SSE4A instructions don't map into the above match helpers. | |||
| 40186 | if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) { | |||
| 40187 | uint64_t BitLen, BitIdx; | |||
| 40188 | if (matchShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, | |||
| 40189 | Zeroable)) { | |||
| 40190 | if (Depth == 0 && Root.getOpcode() == X86ISD::EXTRQI) | |||
| 40191 | return SDValue(); // Nothing to do! | |||
| 40192 | V1 = CanonicalizeShuffleInput(IntMaskVT, V1); | |||
| 40193 | Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1, | |||
| 40194 | DAG.getTargetConstant(BitLen, DL, MVT::i8), | |||
| 40195 | DAG.getTargetConstant(BitIdx, DL, MVT::i8)); | |||
| 40196 | return DAG.getBitcast(RootVT, Res); | |||
| 40197 | } | |||
| 40198 | ||||
| 40199 | if (matchShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { | |||
| 40200 | if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTQI) | |||
| 40201 | return SDValue(); // Nothing to do! | |||
| 40202 | V1 = CanonicalizeShuffleInput(IntMaskVT, V1); | |||
| 40203 | V2 = CanonicalizeShuffleInput(IntMaskVT, V2); | |||
| 40204 | Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2, | |||
| 40205 | DAG.getTargetConstant(BitLen, DL, MVT::i8), | |||
| 40206 | DAG.getTargetConstant(BitIdx, DL, MVT::i8)); | |||
| 40207 | return DAG.getBitcast(RootVT, Res); | |||
| 40208 | } | |||
| 40209 | } | |||
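| | // Illustrative sketch (mask assumed): a v16i8 mask of | |||
| | // {4,5,6,7, Z,Z,Z,Z, U,...} should give BitIdx = 32 and BitLen = 32, | |||
| | // i.e. EXTRQI moves bits [32,64) of V1 down to bit 0 and zeroes the | |||
| | // remainder of the low 64 bits. | |||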
| 40210 | ||||
| 40211 | // Match shuffle against TRUNCATE patterns. | |||
| 40212 | if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) { | |||
| 40213 | // Match against a VTRUNC instruction, accounting for src/dst sizes. | |||
| 40214 | if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable, | |||
| 40215 | Subtarget)) { | |||
| 40216 | bool IsTRUNCATE = ShuffleVT.getVectorNumElements() == | |||
| 40217 | ShuffleSrcVT.getVectorNumElements(); | |||
| 40218 | unsigned Opc = | |||
| 40219 | IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC; | |||
| 40220 | if (Depth == 0 && Root.getOpcode() == Opc) | |||
| 40221 | return SDValue(); // Nothing to do! | |||
| 40222 | V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1); | |||
| 40223 | Res = DAG.getNode(Opc, DL, ShuffleVT, V1); | |||
| 40224 | if (ShuffleVT.getSizeInBits() < RootSizeInBits) | |||
| 40225 | Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits); | |||
| 40226 | return DAG.getBitcast(RootVT, Res); | |||
| 40227 | } | |||
| 40228 | ||||
| 40229 | // Do we need a more general binary truncation pattern? | |||
| 40230 | if (RootSizeInBits < 512 && | |||
| 40231 | ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) || | |||
| 40232 | (RootVT.is128BitVector() && Subtarget.hasVLX())) && | |||
| 40233 | (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) && | |||
| 40234 | isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) { | |||
| 40235 | // Bail if this was already a truncation or PACK node. | |||
| 40236 | // We sometimes fail to match PACK if we demand known undef elements. | |||
| 40237 | if (Depth == 0 && (Root.getOpcode() == ISD::TRUNCATE || | |||
| 40238 | Root.getOpcode() == X86ISD::PACKSS || | |||
| 40239 | Root.getOpcode() == X86ISD::PACKUS)) | |||
| 40240 | return SDValue(); // Nothing to do! | |||
| 40241 | ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2); | |||
| 40242 | ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2); | |||
| 40243 | V1 = CanonicalizeShuffleInput(ShuffleSrcVT, V1); | |||
| 40244 | V2 = CanonicalizeShuffleInput(ShuffleSrcVT, V2); | |||
| 40245 | ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2); | |||
| 40246 | ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts); | |||
| 40247 | Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2); | |||
| 40248 | Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res); | |||
| 40249 | return DAG.getBitcast(RootVT, Res); | |||
| 40250 | } | |||
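| | // Illustrative sketch (assumed v16i8 root): Mask = {0,2,4,...,30} takes | |||
| | // the even bytes of the V1/V2 pair, which is exactly | |||
| | // truncate(concat(v8i16 V1, v8i16 V2)): v16i16 -> v16i8. | |||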
| 40251 | } | |||
| 40252 | ||||
| 40253 | // Don't try to re-form single instruction chains under any circumstances now | |||
| 40254 | // that we've done encoding canonicalization for them. | |||
| 40255 | if (Depth < 1) | |||
| 40256 | return SDValue(); | |||
| 40257 | ||||
| 40258 | // Depth threshold above which we can efficiently use variable mask shuffles. | |||
| 40259 | int VariableCrossLaneShuffleDepth = | |||
| 40260 | Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2; | |||
| 40261 | int VariablePerLaneShuffleDepth = | |||
| 40262 | Subtarget.hasFastVariablePerLaneShuffle() ? 1 : 2; | |||
| 40263 | AllowVariableCrossLaneMask &= | |||
| 40264 | (Depth >= VariableCrossLaneShuffleDepth) || HasVariableMask; | |||
| 40265 | AllowVariablePerLaneMask &= | |||
| 40266 | (Depth >= VariablePerLaneShuffleDepth) || HasVariableMask; | |||
| 40267 | // VPERMI2W/VPERMI2B are 3 uops on Skylake and Icelake so we require a | |||
| 40268 | // higher depth before combining them. | |||
| 40269 | bool AllowBWIVPERMV3 = | |||
| 40270 | (Depth >= (VariableCrossLaneShuffleDepth + 2) || HasVariableMask); | |||
| 40271 | ||||
| 40272 | bool MaskContainsZeros = isAnyZero(Mask); | |||
| 40273 | ||||
| 40274 | if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) { | |||
| 40275 | // If we have a single input lane-crossing shuffle then lower to VPERMV. | |||
| 40276 | if (UnaryShuffle && AllowVariableCrossLaneMask && !MaskContainsZeros) { | |||
| 40277 | if (Subtarget.hasAVX2() && | |||
| 40278 | (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) { | |||
| 40279 | SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); | |||
| 40280 | Res = CanonicalizeShuffleInput(MaskVT, V1); | |||
| 40281 | Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res); | |||
| 40282 | return DAG.getBitcast(RootVT, Res); | |||
| 40283 | } | |||
| 40284 | // AVX512 variants (non-VLX will pad to 512-bit shuffles). | |||
| 40285 | if ((Subtarget.hasAVX512() && | |||
| 40286 | (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || | |||
| 40287 | MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || | |||
| 40288 | (Subtarget.hasBWI() && | |||
| 40289 | (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) || | |||
| 40290 | (Subtarget.hasVBMI() && | |||
| 40291 | (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8))) { | |||
| 40292 | V1 = CanonicalizeShuffleInput(MaskVT, V1); | |||
| 40293 | V2 = DAG.getUNDEF(MaskVT); | |||
| 40294 | Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG); | |||
| 40295 | return DAG.getBitcast(RootVT, Res); | |||
| 40296 | } | |||
| 40297 | } | |||
| 40298 | ||||
| 40299 | // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero | |||
| 40300 | // vector as the second source (non-VLX will pad to 512-bit shuffles). | |||
| 40301 | if (UnaryShuffle && AllowVariableCrossLaneMask && | |||
| 40302 | ((Subtarget.hasAVX512() && | |||
| 40303 | (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || | |||
| 40304 | MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || | |||
| 40305 | MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32 || | |||
| 40306 | MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || | |||
| 40307 | (Subtarget.hasBWI() && AllowBWIVPERMV3 && | |||
| 40308 | (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) || | |||
| 40309 | (Subtarget.hasVBMI() && AllowBWIVPERMV3 && | |||
| 40310 | (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) { | |||
| 40311 | // Adjust shuffle mask - replace SM_SentinelZero with second source index. | |||
| 40312 | for (unsigned i = 0; i != NumMaskElts; ++i) | |||
| 40313 | if (Mask[i] == SM_SentinelZero) | |||
| 40314 | Mask[i] = NumMaskElts + i; | |||
| 40315 | V1 = CanonicalizeShuffleInput(MaskVT, V1); | |||
| 40316 | V2 = getZeroVector(MaskVT, Subtarget, DAG, DL); | |||
| 40317 | Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG); | |||
| 40318 | return DAG.getBitcast(RootVT, Res); | |||
| 40319 | } | |||
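| | // Illustrative sketch (assumed v8i64 mask): {0, Z, 2, Z, 4, Z, 6, Z} is | |||
| | // rewritten to {0, 9, 2, 11, 4, 13, 6, 15}, redirecting every zero | |||
| | // sentinel at the all-zeros second source so a plain VPERMV3 performs the | |||
| | // zeroing. | |||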
| 40320 | ||||
| 40321 | // If that failed and either input is extracted then try to combine as a | |||
| 40322 | // shuffle with the larger type. | |||
| 40323 | if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( | |||
| 40324 | Inputs, Root, BaseMask, Depth, HasVariableMask, | |||
| 40325 | AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, | |||
| 40326 | Subtarget)) | |||
| 40327 | return WideShuffle; | |||
| 40328 | ||||
| 40329 | // If we have a dual input lane-crossing shuffle then lower to VPERMV3 | |||
| 40330 | // (non-VLX will pad to 512-bit shuffles). | |||
| 40331 | if (AllowVariableCrossLaneMask && !MaskContainsZeros && | |||
| 40332 | ((Subtarget.hasAVX512() && | |||
| 40333 | (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || | |||
| 40334 | MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 || | |||
| 40335 | MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32 || | |||
| 40336 | MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || | |||
| 40337 | (Subtarget.hasBWI() && AllowBWIVPERMV3 && | |||
| 40338 | (MaskVT == MVT::v16i16 || MaskVT == MVT::v32i16)) || | |||
| 40339 | (Subtarget.hasVBMI() && AllowBWIVPERMV3 && | |||
| 40340 | (MaskVT == MVT::v32i8 || MaskVT == MVT::v64i8)))) { | |||
| 40341 | V1 = CanonicalizeShuffleInput(MaskVT, V1); | |||
| 40342 | V2 = CanonicalizeShuffleInput(MaskVT, V2); | |||
| 40343 | Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG); | |||
| 40344 | return DAG.getBitcast(RootVT, Res); | |||
| 40345 | } | |||
| 40346 | return SDValue(); | |||
| 40347 | } | |||
| 40348 | ||||
| 40349 | // See if we can combine a single input shuffle with zeros to a bit-mask, | |||
| 40350 | // which is much simpler than any shuffle. | |||
| 40351 | if (UnaryShuffle && MaskContainsZeros && AllowVariablePerLaneMask && | |||
| 40352 | isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) && | |||
| 40353 | DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) { | |||
| 40354 | APInt Zero = APInt::getZero(MaskEltSizeInBits); | |||
| 40355 | APInt AllOnes = APInt::getAllOnes(MaskEltSizeInBits); | |||
| 40356 | APInt UndefElts(NumMaskElts, 0); | |||
| 40357 | SmallVector<APInt, 64> EltBits(NumMaskElts, Zero); | |||
| 40358 | for (unsigned i = 0; i != NumMaskElts; ++i) { | |||
| 40359 | int M = Mask[i]; | |||
| 40360 | if (M == SM_SentinelUndef) { | |||
| 40361 | UndefElts.setBit(i); | |||
| 40362 | continue; | |||
| 40363 | } | |||
| 40364 | if (M == SM_SentinelZero) | |||
| 40365 | continue; | |||
| 40366 | EltBits[i] = AllOnes; | |||
| 40367 | } | |||
| 40368 | SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL); | |||
| 40369 | Res = CanonicalizeShuffleInput(MaskVT, V1); | |||
| 40370 | unsigned AndOpcode = | |||
| 40371 | MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND); | |||
| 40372 | Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask); | |||
| 40373 | return DAG.getBitcast(RootVT, Res); | |||
| 40374 | } | |||
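| | // Illustrative sketch (assumed v4i32 mask): {0, Z, 2, 3} is the identity | |||
| | // shuffle with element 1 zeroed, so it folds to V1 & {-1, 0, -1, -1} | |||
| | // (FAND in the floating-point domain). | |||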
| 40375 | ||||
| 40376 | // If we have a single input shuffle with different shuffle patterns in | |||
| 40377 | // the 128-bit lanes, use a variable mask to lower to VPERMILPS. | |||
| 40378 | // TODO: Combine other mask types at higher depths. | |||
| 40379 | if (UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros && | |||
| 40380 | ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) || | |||
| 40381 | (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) { | |||
| 40382 | SmallVector<SDValue, 16> VPermIdx; | |||
| 40383 | for (int M : Mask) { | |||
| 40384 | SDValue Idx = | |||
| 40385 | M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32); | |||
| 40386 | VPermIdx.push_back(Idx); | |||
| 40387 | } | |||
| 40388 | SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx); | |||
| 40389 | Res = CanonicalizeShuffleInput(MaskVT, V1); | |||
| 40390 | Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask); | |||
| 40391 | return DAG.getBitcast(RootVT, Res); | |||
| 40392 | } | |||
| 40393 | ||||
| 40394 | // With XOP, binary shuffles of 128/256-bit floating point vectors can combine | |||
| 40395 | // to VPERMIL2PD/VPERMIL2PS. | |||
| 40396 | if (AllowVariablePerLaneMask && Subtarget.hasXOP() && | |||
| 40397 | (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 || | |||
| 40398 | MaskVT == MVT::v8f32)) { | |||
| 40399 | // VPERMIL2 Operation. | |||
| 40400 | // Bits[3] - Match Bit. | |||
| 40401 | // Bits[2:1] - (Per Lane) PD Shuffle Mask. | |||
| 40402 | // Bits[2:0] - (Per Lane) PS Shuffle Mask. | |||
| 40403 | unsigned NumLanes = MaskVT.getSizeInBits() / 128; | |||
| 40404 | unsigned NumEltsPerLane = NumMaskElts / NumLanes; | |||
| 40405 | SmallVector<int, 8> VPerm2Idx; | |||
| 40406 | unsigned M2ZImm = 0; | |||
| 40407 | for (int M : Mask) { | |||
| 40408 | if (M == SM_SentinelUndef) { | |||
| 40409 | VPerm2Idx.push_back(-1); | |||
| 40410 | continue; | |||
| 40411 | } | |||
| 40412 | if (M == SM_SentinelZero) { | |||
| 40413 | M2ZImm = 2; | |||
| 40414 | VPerm2Idx.push_back(8); | |||
| 40415 | continue; | |||
| 40416 | } | |||
| 40417 | int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane); | |||
| 40418 | Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index); | |||
| 40419 | VPerm2Idx.push_back(Index); | |||
| 40420 | } | |||
| 40421 | V1 = CanonicalizeShuffleInput(MaskVT, V1); | |||
| 40422 | V2 = CanonicalizeShuffleInput(MaskVT, V2); | |||
| 40423 | SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true); | |||
| 40424 | Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp, | |||
| 40425 | DAG.getTargetConstant(M2ZImm, DL, MVT::i8)); | |||
| 40426 | return DAG.getBitcast(RootVT, Res); | |||
| 40427 | } | |||
| 40428 | ||||
| 40429 | // If we have 3 or more shuffle instructions or a chain involving a variable | |||
| 40430 | // mask, we can replace them with a single PSHUFB instruction profitably. | |||
| 40431 | // Intel's manuals suggest only using PSHUFB if doing so replaces 5 | |||
| 40432 | // instructions, but in practice PSHUFB tends to be *very* fast so we're | |||
| 40433 | // more aggressive. | |||
| 40434 | if (UnaryShuffle && AllowVariablePerLaneMask && | |||
| 40435 | ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) || | |||
| 40436 | (RootVT.is256BitVector() && Subtarget.hasAVX2()) || | |||
| 40437 | (RootVT.is512BitVector() && Subtarget.hasBWI()))) { | |||
| 40438 | SmallVector<SDValue, 16> PSHUFBMask; | |||
| 40439 | int NumBytes = RootVT.getSizeInBits() / 8; | |||
| 40440 | int Ratio = NumBytes / NumMaskElts; | |||
| 40441 | for (int i = 0; i < NumBytes; ++i) { | |||
| 40442 | int M = Mask[i / Ratio]; | |||
| 40443 | if (M == SM_SentinelUndef) { | |||
| 40444 | PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); | |||
| 40445 | continue; | |||
| 40446 | } | |||
| 40447 | if (M == SM_SentinelZero) { | |||
| 40448 | PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8)); | |||
| 40449 | continue; | |||
| 40450 | } | |||
| 40451 | M = Ratio * M + i % Ratio; | |||
| 40452 | assert((M / 16) == (i / 16) && "Lane crossing detected"); | |||
| 40453 | PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); | |||
| 40454 | } | |||
| 40455 | MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); | |||
| 40456 | Res = CanonicalizeShuffleInput(ByteVT, V1); | |||
| 40457 | SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask); | |||
| 40458 | Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp); | |||
| 40459 | return DAG.getBitcast(RootVT, Res); | |||
| 40460 | } | |||
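| | // Illustrative sketch (assumed v4i32 mask {1,0,3,2} on a 128-bit root): | |||
| | // Ratio = 4, so the byte mask becomes {4,5,6,7, 0,1,2,3, 12,13,14,15, | |||
| | // 8,9,10,11}; zero sentinels become 0x80, which PSHUFB treats as | |||
| | // "write a zero byte". | |||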
| 40461 | ||||
| 40462 | // With XOP, if we have a 128-bit binary input shuffle we can always combine | |||
| 40463 | // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never | |||
| 40464 | // slower than PSHUFB on targets that support both. | |||
| 40465 | if (AllowVariablePerLaneMask && RootVT.is128BitVector() && | |||
| 40466 | Subtarget.hasXOP()) { | |||
| 40467 | // VPPERM Mask Operation | |||
| 40468 | // Bits[4:0] - Byte Index (0 - 31) | |||
| 40469 | // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO) | |||
| 40470 | SmallVector<SDValue, 16> VPPERMMask; | |||
| 40471 | int NumBytes = 16; | |||
| 40472 | int Ratio = NumBytes / NumMaskElts; | |||
| 40473 | for (int i = 0; i < NumBytes; ++i) { | |||
| 40474 | int M = Mask[i / Ratio]; | |||
| 40475 | if (M == SM_SentinelUndef) { | |||
| 40476 | VPPERMMask.push_back(DAG.getUNDEF(MVT::i8)); | |||
| 40477 | continue; | |||
| 40478 | } | |||
| 40479 | if (M == SM_SentinelZero) { | |||
| 40480 | VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8)); | |||
| 40481 | continue; | |||
| 40482 | } | |||
| 40483 | M = Ratio * M + i % Ratio; | |||
| 40484 | VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8)); | |||
| 40485 | } | |||
| 40486 | MVT ByteVT = MVT::v16i8; | |||
| 40487 | V1 = CanonicalizeShuffleInput(ByteVT, V1); | |||
| 40488 | V2 = CanonicalizeShuffleInput(ByteVT, V2); | |||
| 40489 | SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask); | |||
| 40490 | Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp); | |||
| 40491 | return DAG.getBitcast(RootVT, Res); | |||
| 40492 | } | |||
| 40493 | ||||
| 40494 | // If that failed and either input is extracted then try to combine as a | |||
| 40495 | // shuffle with the larger type. | |||
| 40496 | if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( | |||
| 40497 | Inputs, Root, BaseMask, Depth, HasVariableMask, | |||
| 40498 | AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget)) | |||
| 40499 | return WideShuffle; | |||
| 40500 | ||||
| 40501 | // If we have a dual input shuffle then lower to VPERMV3 | |||
| 40502 | // (non-VLX will pad to 512-bit shuffles). | |||
| 40503 | if (!UnaryShuffle && AllowVariablePerLaneMask && !MaskContainsZeros && | |||
| 40504 | ((Subtarget.hasAVX512() && | |||
| 40505 | (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v8f64 || | |||
| 40506 | MaskVT == MVT::v2i64 || MaskVT == MVT::v4i64 || MaskVT == MVT::v8i64 || | |||
| 40507 | MaskVT == MVT::v4f32 || MaskVT == MVT::v4i32 || MaskVT == MVT::v8f32 || | |||
| 40508 | MaskVT == MVT::v8i32 || MaskVT == MVT::v16f32 || | |||
| 40509 | MaskVT == MVT::v16i32)) || | |||
| 40510 | (Subtarget.hasBWI() && AllowBWIVPERMV3 && | |||
| 40511 | (MaskVT == MVT::v8i16 || MaskVT == MVT::v16i16 || | |||
| 40512 | MaskVT == MVT::v32i16)) || | |||
| 40513 | (Subtarget.hasVBMI() && AllowBWIVPERMV3 && | |||
| 40514 | (MaskVT == MVT::v16i8 || MaskVT == MVT::v32i8 || | |||
| 40515 | MaskVT == MVT::v64i8)))) { | |||
| 40516 | V1 = CanonicalizeShuffleInput(MaskVT, V1); | |||
| 40517 | V2 = CanonicalizeShuffleInput(MaskVT, V2); | |||
| 40518 | Res = lowerShuffleWithPERMV(DL, MaskVT, Mask, V1, V2, Subtarget, DAG); | |||
| 40519 | return DAG.getBitcast(RootVT, Res); | |||
| 40520 | } | |||
| 40521 | ||||
| 40522 | // Failed to find any combines. | |||
| 40523 | return SDValue(); | |||
| 40524 | } | |||
| 40525 | ||||
| 40526 | // Combine an arbitrary chain of shuffles + extract_subvectors into a single | |||
| 40527 | // instruction if possible. | |||
| 40528 | // | |||
| 40529 | // Wrapper for combineX86ShuffleChain that extends the shuffle mask to a larger | |||
| 40530 | // type size to attempt to combine: | |||
| 40531 | // shuffle(extract_subvector(x,c1),extract_subvector(y,c2),m1) | |||
| 40532 | // --> | |||
| 40533 | // extract_subvector(shuffle(x,y,m2),0) | |||
| 40534 | static SDValue combineX86ShuffleChainWithExtract( | |||
| 40535 | ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> BaseMask, int Depth, | |||
| 40536 | bool HasVariableMask, bool AllowVariableCrossLaneMask, | |||
| 40537 | bool AllowVariablePerLaneMask, SelectionDAG &DAG, | |||
| 40538 | const X86Subtarget &Subtarget) { | |||
| 40539 | unsigned NumMaskElts = BaseMask.size(); | |||
| 40540 | unsigned NumInputs = Inputs.size(); | |||
| 40541 | if (NumInputs == 0) | |||
| 40542 | return SDValue(); | |||
| 40543 | ||||
| 40544 | EVT RootVT = Root.getValueType(); | |||
| 40545 | unsigned RootSizeInBits = RootVT.getSizeInBits(); | |||
| 40546 | unsigned RootEltSizeInBits = RootSizeInBits / NumMaskElts; | |||
| 40547 | assert((RootSizeInBits % NumMaskElts) == 0 && "Unexpected root shuffle mask"); | |||
| 40548 | ||||
| 40549 | // Peek through extract_subvector to find the widest legal vector. | |||
| 40550 | // TODO: Handle ISD::TRUNCATE | |||
| 40551 | unsigned WideSizeInBits = RootSizeInBits; | |||
| 40552 | for (unsigned I = 0; I != NumInputs; ++I) { | |||
| 40553 | SDValue Input = peekThroughBitcasts(Inputs[I]); | |||
| 40554 | while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR) | |||
| 40555 | Input = peekThroughBitcasts(Input.getOperand(0)); | |||
| 40556 | if (DAG.getTargetLoweringInfo().isTypeLegal(Input.getValueType()) && | |||
| 40557 | WideSizeInBits < Input.getValueSizeInBits()) | |||
| 40558 | WideSizeInBits = Input.getValueSizeInBits(); | |||
| 40559 | } | |||
| 40560 | ||||
| 40561 | // Bail if we fail to find a source larger than the existing root. | |||
| 40562 | unsigned Scale = WideSizeInBits / RootSizeInBits; | |||
| 40563 | if (WideSizeInBits <= RootSizeInBits || | |||
| 40564 | (WideSizeInBits % RootSizeInBits) != 0) | |||
| 40565 | return SDValue(); | |||
| 40566 | ||||
| 40567 | // Create new mask for larger type. | |||
| 40568 | SmallVector<int, 64> WideMask(BaseMask); | |||
| 40569 | for (int &M : WideMask) { | |||
| 40570 | if (M < 0) | |||
| 40571 | continue; | |||
| 40572 | M = (M % NumMaskElts) + ((M / NumMaskElts) * Scale * NumMaskElts); | |||
| 40573 | } | |||
| 40574 | WideMask.append((Scale - 1) * NumMaskElts, SM_SentinelUndef); | |||
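| | // Illustrative sketch (assumed Scale = 2, NumMaskElts = 4): index M = 5 | |||
| | // (input 1, element 1) remaps to 1 + 1*2*4 = 9, keeping each operand's | |||
| | // elements contiguous in the widened index space; the mask is then padded | |||
| | // to 8 entries with undef. | |||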
| 40575 | ||||
| 40576 | // Attempt to peek through inputs and adjust mask when we extract from an | |||
| 40577 | // upper subvector. | |||
| 40578 | int AdjustedMasks = 0; | |||
| 40579 | SmallVector<SDValue, 4> WideInputs(Inputs.begin(), Inputs.end()); | |||
| 40580 | for (unsigned I = 0; I != NumInputs; ++I) { | |||
| 40581 | SDValue &Input = WideInputs[I]; | |||
| 40582 | Input = peekThroughBitcasts(Input); | |||
| 40583 | while (Input.getOpcode() == ISD::EXTRACT_SUBVECTOR && | |||
| 40584 | Input.getOperand(0).getValueSizeInBits() <= WideSizeInBits) { | |||
| 40585 | uint64_t Idx = Input.getConstantOperandVal(1); | |||
| 40586 | if (Idx != 0) { | |||
| 40587 | ++AdjustedMasks; | |||
| 40588 | unsigned InputEltSizeInBits = Input.getScalarValueSizeInBits(); | |||
| 40589 | Idx = (Idx * InputEltSizeInBits) / RootEltSizeInBits; | |||
| 40590 | ||||
| 40591 | int lo = I * WideMask.size(); | |||
| 40592 | int hi = (I + 1) * WideMask.size(); | |||
| 40593 | for (int &M : WideMask) | |||
| 40594 | if (lo <= M && M < hi) | |||
| 40595 | M += Idx; | |||
| 40596 | } | |||
| 40597 | Input = peekThroughBitcasts(Input.getOperand(0)); | |||
| 40598 | } | |||
| 40599 | } | |||
| 40600 | ||||
| 40601 | // Remove unused/repeated shuffle source ops. | |||
| 40602 | resolveTargetShuffleInputsAndMask(WideInputs, WideMask); | |||
| 40603 | assert(!WideInputs.empty() && "Shuffle with no inputs detected"); | |||
| 40604 | ||||
| 40605 | // Bail if we're always extracting from the lowest subvectors | |||
| 40606 | // (combineX86ShuffleChain should match this for the current width), or if | |||
| 40607 | // the shuffle still references too many inputs. | |||
| 40608 | if (AdjustedMasks == 0 || WideInputs.size() > 2) | |||
| 40609 | return SDValue(); | |||
| 40610 | ||||
| 40611 | // Minor canonicalization of the accumulated shuffle mask to make it easier | |||
| 40612 | // to match below. All this does is detect masks with sequential pairs of | |||
| 40613 | // elements, and shrink them to the half-width mask. It does this in a loop | |||
| 40614 | // so it will reduce the size of the mask to the minimal width mask which | |||
| 40615 | // performs an equivalent shuffle. | |||
| 40616 | while (WideMask.size() > 1) { | |||
| 40617 | SmallVector<int, 64> WidenedMask; | |||
| 40618 | if (!canWidenShuffleElements(WideMask, WidenedMask)) | |||
| 40619 | break; | |||
| 40620 | WideMask = std::move(WidenedMask); | |||
| 40621 | } | |||
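| | // Illustrative sketch (mask assumed): {4, 5, 0, 1} consists of sequential | |||
| | // pairs and shrinks to {2, 0}; the loop repeats until no adjacent pair | |||
| | // merges, leaving the minimal-width equivalent mask. | |||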
| 40622 | ||||
| 40623 | // Canonicalization of binary shuffle masks to improve pattern matching by | |||
| 40624 | // commuting the inputs. | |||
| 40625 | if (WideInputs.size() == 2 && canonicalizeShuffleMaskWithCommute(WideMask)) { | |||
| 40626 | ShuffleVectorSDNode::commuteMask(WideMask); | |||
| 40627 | std::swap(WideInputs[0], WideInputs[1]); | |||
| 40628 | } | |||
| 40629 | ||||
| 40630 | // Increase depth for every upper subvector we've peeked through. | |||
| 40631 | Depth += AdjustedMasks; | |||
| 40632 | ||||
| 40633 | // Attempt to combine wider chain. | |||
| 40634 | // TODO: Can we use a better Root? | |||
| 40635 | SDValue WideRoot = WideInputs.front().getValueSizeInBits() > | |||
| 40636 | WideInputs.back().getValueSizeInBits() | |||
| 40637 | ? WideInputs.front() | |||
| 40638 | : WideInputs.back(); | |||
| 40639 | assert(WideRoot.getValueSizeInBits() == WideSizeInBits && | |||
| 40640 | "WideRootSize mismatch"); | |||
| 40641 | ||||
| 40642 | if (SDValue WideShuffle = | |||
| 40643 | combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth, | |||
| 40644 | HasVariableMask, AllowVariableCrossLaneMask, | |||
| 40645 | AllowVariablePerLaneMask, DAG, Subtarget)) { | |||
| 40646 | WideShuffle = | |||
| 40647 | extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits); | |||
| 40648 | return DAG.getBitcast(RootVT, WideShuffle); | |||
| 40649 | } | |||
| 40650 | ||||
| 40651 | return SDValue(); | |||
| 40652 | } | |||
| 40653 | ||||
| 40654 | // Canonicalize the combined shuffle mask chain with horizontal ops. | |||
| 40655 | // NOTE: This may update the Ops and Mask. | |||
| 40656 | static SDValue canonicalizeShuffleMaskWithHorizOp( | |||
| 40657 | MutableArrayRef<SDValue> Ops, MutableArrayRef<int> Mask, | |||
| 40658 | unsigned RootSizeInBits, const SDLoc &DL, SelectionDAG &DAG, | |||
| 40659 | const X86Subtarget &Subtarget) { | |||
| 40660 | if (Mask.empty() || Ops.empty()) | |||
| 40661 | return SDValue(); | |||
| 40662 | ||||
| 40663 | SmallVector<SDValue> BC; | |||
| 40664 | for (SDValue Op : Ops) | |||
| 40665 | BC.push_back(peekThroughBitcasts(Op)); | |||
| 40666 | ||||
| 40667 | // All ops must be the same horizop + type. | |||
| 40668 | SDValue BC0 = BC[0]; | |||
| 40669 | EVT VT0 = BC0.getValueType(); | |||
| 40670 | unsigned Opcode0 = BC0.getOpcode(); | |||
| 40671 | if (VT0.getSizeInBits() != RootSizeInBits || llvm::any_of(BC, [&](SDValue V) { | |||
| 40672 | return V.getOpcode() != Opcode0 || V.getValueType() != VT0; | |||
| 40673 | })) | |||
| 40674 | return SDValue(); | |||
| 40675 | ||||
| 40676 | bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD || | |||
| 40677 | Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB); | |||
| 40678 | bool isPack = (Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS); | |||
| 40679 | if (!isHoriz && !isPack) | |||
| 40680 | return SDValue(); | |||
| 40681 | ||||
| 40682 | // Do all ops have a single use? | |||
| 40683 | bool OneUseOps = llvm::all_of(Ops, [](SDValue Op) { | |||
| 40684 | return Op.hasOneUse() && | |||
| 40685 | peekThroughBitcasts(Op) == peekThroughOneUseBitcasts(Op); | |||
| 40686 | }); | |||
| 40687 | ||||
| 40688 | int NumElts = VT0.getVectorNumElements(); | |||
| 40689 | int NumLanes = VT0.getSizeInBits() / 128; | |||
| 40690 | int NumEltsPerLane = NumElts / NumLanes; | |||
| 40691 | int NumHalfEltsPerLane = NumEltsPerLane / 2; | |||
| 40692 | MVT SrcVT = BC0.getOperand(0).getSimpleValueType(); | |||
| 40693 | unsigned EltSizeInBits = RootSizeInBits / Mask.size(); | |||
| 40694 | ||||
| 40695 | if (NumEltsPerLane >= 4 && | |||
| 40696 | (isPack || shouldUseHorizontalOp(Ops.size() == 1, DAG, Subtarget))) { | |||
| 40697 | SmallVector<int> LaneMask, ScaledMask; | |||
| 40698 | if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, LaneMask) && | |||
| 40699 | scaleShuffleElements(LaneMask, 4, ScaledMask)) { | |||
| 40700 | // See if we can remove the shuffle by resorting the HOP chain so that | |||
| 40701 | // the HOP args are pre-shuffled. | |||
| 40702 | // TODO: Generalize to any sized/depth chain. | |||
| 40703 | // TODO: Add support for PACKSS/PACKUS. | |||
| 40704 | if (isHoriz) { | |||
| 40705 | // Attempt to find a HOP(HOP(X,Y),HOP(Z,W)) source operand. | |||
| 40706 | auto GetHOpSrc = [&](int M) { | |||
| 40707 | if (M == SM_SentinelUndef) | |||
| 40708 | return DAG.getUNDEF(VT0); | |||
| 40709 | if (M == SM_SentinelZero) | |||
| 40710 | return getZeroVector(VT0.getSimpleVT(), Subtarget, DAG, DL); | |||
| 40711 | SDValue Src0 = BC[M / 4]; | |||
| 40712 | SDValue Src1 = Src0.getOperand((M % 4) >= 2); | |||
| 40713 | if (Src1.getOpcode() == Opcode0 && Src0->isOnlyUserOf(Src1.getNode())) | |||
| 40714 | return Src1.getOperand(M % 2); | |||
| 40715 | return SDValue(); | |||
| 40716 | }; | |||
| 40717 | SDValue M0 = GetHOpSrc(ScaledMask[0]); | |||
| 40718 | SDValue M1 = GetHOpSrc(ScaledMask[1]); | |||
| 40719 | SDValue M2 = GetHOpSrc(ScaledMask[2]); | |||
| 40720 | SDValue M3 = GetHOpSrc(ScaledMask[3]); | |||
| 40721 | if (M0 && M1 && M2 && M3) { | |||
| 40722 | SDValue LHS = DAG.getNode(Opcode0, DL, SrcVT, M0, M1); | |||
| 40723 | SDValue RHS = DAG.getNode(Opcode0, DL, SrcVT, M2, M3); | |||
| 40724 | return DAG.getNode(Opcode0, DL, VT0, LHS, RHS); | |||
| 40725 | } | |||
| 40726 | } | |||
| 40727 | // shuffle(hop(x,y),hop(z,w)) -> permute(hop(x,z)) etc. | |||
| 40728 | if (Ops.size() >= 2) { | |||
| 40729 | SDValue LHS, RHS; | |||
| 40730 | auto GetHOpSrc = [&](int M, int &OutM) { | |||
| 40731 | // TODO: Support SM_SentinelZero | |||
| 40732 | if (M < 0) | |||
| 40733 | return M == SM_SentinelUndef; | |||
| 40734 | SDValue Src = BC[M / 4].getOperand((M % 4) >= 2); | |||
| 40735 | if (!LHS || LHS == Src) { | |||
| 40736 | LHS = Src; | |||
| 40737 | OutM = (M % 2); | |||
| 40738 | return true; | |||
| 40739 | } | |||
| 40740 | if (!RHS || RHS == Src) { | |||
| 40741 | RHS = Src; | |||
| 40742 | OutM = (M % 2) + 2; | |||
| 40743 | return true; | |||
| 40744 | } | |||
| 40745 | return false; | |||
| 40746 | }; | |||
| 40747 | int PostMask[4] = {-1, -1, -1, -1}; | |||
| 40748 | if (GetHOpSrc(ScaledMask[0], PostMask[0]) && | |||
| 40749 | GetHOpSrc(ScaledMask[1], PostMask[1]) && | |||
| 40750 | GetHOpSrc(ScaledMask[2], PostMask[2]) && | |||
| 40751 | GetHOpSrc(ScaledMask[3], PostMask[3])) { | |||
| 40752 | LHS = DAG.getBitcast(SrcVT, LHS); | |||
| 40753 | RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS); | |||
| 40754 | SDValue Res = DAG.getNode(Opcode0, DL, VT0, LHS, RHS); | |||
| 40755 | // Use SHUFPS for the permute so this will work on SSE3 targets, | |||
| 40756 | // shuffle combining and domain handling will simplify this later on. | |||
| 40757 | MVT ShuffleVT = MVT::getVectorVT(MVT::f32, RootSizeInBits / 32); | |||
| 40758 | Res = DAG.getBitcast(ShuffleVT, Res); | |||
| 40759 | return DAG.getNode(X86ISD::SHUFP, DL, ShuffleVT, Res, Res, | |||
| 40760 | getV4X86ShuffleImm8ForMask(PostMask, DL, DAG)); | |||
| 40761 | } | |||
| 40762 | } | |||
| 40763 | } | |||
| 40764 | } | |||
| 40765 | ||||
| 40766 | if (2 < Ops.size()) | |||
| 40767 | return SDValue(); | |||
| 40768 | ||||
| 40769 | SDValue BC1 = BC[BC.size() - 1]; | |||
| 40770 | if (Mask.size() == VT0.getVectorNumElements()) { | |||
| 40771 | // Canonicalize binary shuffles of horizontal ops that use the | |||
| 40772 | // same sources to a unary shuffle. | |||
| 40773 | // TODO: Try to perform this fold even if the shuffle remains. | |||
| 40774 | if (Ops.size() == 2) { | |||
| 40775 | auto ContainsOps = [](SDValue HOp, SDValue Op) { | |||
| 40776 | return Op == HOp.getOperand(0) || Op == HOp.getOperand(1); | |||
| 40777 | }; | |||
| 40778 | // Commute if all BC0's ops are contained in BC1. | |||
| 40779 | if (ContainsOps(BC1, BC0.getOperand(0)) && | |||
| 40780 | ContainsOps(BC1, BC0.getOperand(1))) { | |||
| 40781 | ShuffleVectorSDNode::commuteMask(Mask); | |||
| 40782 | std::swap(Ops[0], Ops[1]); | |||
| 40783 | std::swap(BC0, BC1); | |||
| 40784 | } | |||
| 40785 | ||||
| 40786 | // If BC1 can be represented by BC0, then convert to unary shuffle. | |||
| 40787 | if (ContainsOps(BC0, BC1.getOperand(0)) && | |||
| 40788 | ContainsOps(BC0, BC1.getOperand(1))) { | |||
| 40789 | for (int &M : Mask) { | |||
| 40790 | if (M < NumElts) // BC0 element or UNDEF/Zero sentinel. | |||
| 40791 | continue; | |||
| 40792 | int SubLane = ((M % NumEltsPerLane) >= NumHalfEltsPerLane) ? 1 : 0; | |||
| 40793 | M -= NumElts + (SubLane * NumHalfEltsPerLane); | |||
| 40794 | if (BC1.getOperand(SubLane) != BC0.getOperand(0)) | |||
| 40795 | M += NumHalfEltsPerLane; | |||
| 40796 | } | |||
| 40797 | } | |||
| 40798 | } | |||
| 40799 | ||||
| 40800 | // Canonicalize unary horizontal ops to only refer to lower halves. | |||
| 40801 | for (int i = 0; i != NumElts; ++i) { | |||
| 40802 | int &M = Mask[i]; | |||
| 40803 | if (isUndefOrZero(M)) | |||
| 40804 | continue; | |||
| 40805 | if (M < NumElts && BC0.getOperand(0) == BC0.getOperand(1) && | |||
| 40806 | (M % NumEltsPerLane) >= NumHalfEltsPerLane) | |||
| 40807 | M -= NumHalfEltsPerLane; | |||
| 40808 | if (NumElts <= M && BC1.getOperand(0) == BC1.getOperand(1) && | |||
| 40809 | (M % NumEltsPerLane) >= NumHalfEltsPerLane) | |||
| 40810 | M -= NumHalfEltsPerLane; | |||
| 40811 | } | |||
| 40812 | } | |||
| 40813 | ||||
| 40814 | // Combine binary shuffle of 2 similar 'Horizontal' instructions into a | |||
| 40815 | // single instruction. Attempt to match a v2X64 repeating shuffle pattern that | |||
| 40816 | // represents the LHS/RHS inputs for the lower/upper halves. | |||
| 40817 | SmallVector<int, 16> TargetMask128, WideMask128; | |||
| 40818 | if (isRepeatedTargetShuffleMask(128, EltSizeInBits, Mask, TargetMask128) && | |||
| 40819 | scaleShuffleElements(TargetMask128, 2, WideMask128)) { | |||
| 40820 | assert(isUndefOrZeroOrInRange(WideMask128, 0, 4) && "Illegal shuffle"); | |||
| 40821 | bool SingleOp = (Ops.size() == 1); | |||
| 40822 | if (isPack || OneUseOps || | |||
| 40823 | shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) { | |||
| 40824 | SDValue Lo = isInRange(WideMask128[0], 0, 2) ? BC0 : BC1; | |||
| 40825 | SDValue Hi = isInRange(WideMask128[1], 0, 2) ? BC0 : BC1; | |||
| 40826 | Lo = Lo.getOperand(WideMask128[0] & 1); | |||
| 40827 | Hi = Hi.getOperand(WideMask128[1] & 1); | |||
| 40828 | if (SingleOp) { | |||
| 40829 | SDValue Undef = DAG.getUNDEF(SrcVT); | |||
| 40830 | SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL); | |||
| 40831 | Lo = (WideMask128[0] == SM_SentinelZero ? Zero : Lo); | |||
| 40832 | Hi = (WideMask128[1] == SM_SentinelZero ? Zero : Hi); | |||
| 40833 | Lo = (WideMask128[0] == SM_SentinelUndef ? Undef : Lo); | |||
| 40834 | Hi = (WideMask128[1] == SM_SentinelUndef ? Undef : Hi); | |||
| 40835 | } | |||
| 40836 | return DAG.getNode(Opcode0, DL, VT0, Lo, Hi); | |||
| 40837 | } | |||
| 40838 | } | |||
| 40839 | ||||
| 40840 | return SDValue(); | |||
| 40841 | } | |||
| 40842 | ||||
| 40843 | // Attempt to constant fold all of the constant source ops. | |||
| 40844 | // Returns the folded constant vector if the entire shuffle is constant. | |||
| 40845 | // TODO: Extend this to merge multiple constant Ops and update the mask. | |||
| 40846 | static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops, | |||
| 40847 | ArrayRef<int> Mask, SDValue Root, | |||
| 40848 | bool HasVariableMask, | |||
| 40849 | SelectionDAG &DAG, | |||
| 40850 | const X86Subtarget &Subtarget) { | |||
| 40851 | MVT VT = Root.getSimpleValueType(); | |||
| 40852 | ||||
| 40853 | unsigned SizeInBits = VT.getSizeInBits(); | |||
| 40854 | unsigned NumMaskElts = Mask.size(); | |||
| 40855 | unsigned MaskSizeInBits = SizeInBits / NumMaskElts; | |||
| 40856 | unsigned NumOps = Ops.size(); | |||
| 40857 | ||||
| 40858 | // Extract constant bits from each source op. | |||
| 40859 | SmallVector<APInt, 16> UndefEltsOps(NumOps); | |||
| 40860 | SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps); | |||
| 40861 | for (unsigned I = 0; I != NumOps; ++I) | |||
| 40862 | if (!getTargetConstantBitsFromNode(Ops[I], MaskSizeInBits, UndefEltsOps[I], | |||
| 40863 | RawBitsOps[I])) | |||
| 40864 | return SDValue(); | |||
| 40865 | ||||
| 40866 | // If we're optimizing for size, only fold if at least one of the constants | |||
| 40867 | // is only used once or the combined shuffle has included a variable mask | |||
| 40868 | // shuffle; this avoids constant pool bloat. | |||
| 40869 | bool IsOptimizingSize = DAG.shouldOptForSize(); | |||
| 40870 | if (IsOptimizingSize && !HasVariableMask && | |||
| 40871 | llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); })) | |||
| 40872 | return SDValue(); | |||
| 40873 | ||||
| 40874 | // Shuffle the constant bits according to the mask. | |||
| 40875 | SDLoc DL(Root); | |||
| 40876 | APInt UndefElts(NumMaskElts, 0); | |||
| 40877 | APInt ZeroElts(NumMaskElts, 0); | |||
| 40878 | APInt ConstantElts(NumMaskElts, 0); | |||
| 40879 | SmallVector<APInt, 8> ConstantBitData(NumMaskElts, | |||
| 40880 | APInt::getZero(MaskSizeInBits)); | |||
| 40881 | for (unsigned i = 0; i != NumMaskElts; ++i) { | |||
| 40882 | int M = Mask[i]; | |||
| 40883 | if (M == SM_SentinelUndef) { | |||
| 40884 | UndefElts.setBit(i); | |||
| 40885 | continue; | |||
| 40886 | } else if (M == SM_SentinelZero) { | |||
| 40887 | ZeroElts.setBit(i); | |||
| 40888 | continue; | |||
| 40889 | } | |||
| 40890 | assert(0 <= M && M < (int)(NumMaskElts * NumOps)); | |||
| 40891 | ||||
| 40892 | unsigned SrcOpIdx = (unsigned)M / NumMaskElts; | |||
| 40893 | unsigned SrcMaskIdx = (unsigned)M % NumMaskElts; | |||
| 40894 | ||||
| 40895 | auto &SrcUndefElts = UndefEltsOps[SrcOpIdx]; | |||
| 40896 | if (SrcUndefElts[SrcMaskIdx]) { | |||
| 40897 | UndefElts.setBit(i); | |||
| 40898 | continue; | |||
| 40899 | } | |||
| 40900 | ||||
| 40901 | auto &SrcEltBits = RawBitsOps[SrcOpIdx]; | |||
| 40902 | APInt &Bits = SrcEltBits[SrcMaskIdx]; | |||
| 40903 | if (!Bits) { | |||
| 40904 | ZeroElts.setBit(i); | |||
| 40905 | continue; | |||
| 40906 | } | |||
| 40907 | ||||
| 40908 | ConstantElts.setBit(i); | |||
| 40909 | ConstantBitData[i] = Bits; | |||
| 40910 | } | |||
| 40911 | assert((UndefElts | ZeroElts | ConstantElts).isAllOnes()); | |||
| 40912 | ||||
| 40913 | // Attempt to create a zero vector. | |||
| 40914 | if ((UndefElts | ZeroElts).isAllOnes()) | |||
| 40915 | return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL); | |||
| 40916 | ||||
| 40917 | // Create the constant data. | |||
| 40918 | MVT MaskSVT; | |||
| 40919 | if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64)) | |||
| 40920 | MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits); | |||
| 40921 | else | |||
| 40922 | MaskSVT = MVT::getIntegerVT(MaskSizeInBits); | |||
| 40923 | ||||
| 40924 | MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts); | |||
| 40925 | if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) | |||
| 40926 | return SDValue(); | |||
| 40927 | ||||
| 40928 | SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL); | |||
| 40929 | return DAG.getBitcast(VT, CstOp); | |||
| 40930 | } | |||
| 40931 | ||||
| 40932 | namespace llvm { | |||
| 40933 | namespace X86 { | |||
| 40934 | enum { | |||
| 40935 | MaxShuffleCombineDepth = 8 | |||
| 40936 | }; | |||
| 40937 | } // namespace X86 | |||
| 40938 | } // namespace llvm | |||
| 40939 | ||||
| 40940 | /// Fully generic combining of x86 shuffle instructions. | |||
| 40941 | /// | |||
| 40942 | /// This should be the last combine run over the x86 shuffle instructions. Once | |||
| 40943 | /// they have been fully optimized, this will recursively consider all chains | |||
| 40944 | /// of single-use shuffle instructions, build a generic model of the cumulative | |||
| 40945 | /// shuffle operation, and check for simpler instructions which implement this | |||
| 40946 | /// operation. We use this primarily for two purposes: | |||
| 40947 | /// | |||
| 40948 | /// 1) Collapse generic shuffles to specialized single instructions when | |||
| 40949 | /// equivalent. In most cases, this is just an encoding size win, but | |||
| 40950 | /// sometimes we will collapse multiple generic shuffles into a single | |||
| 40951 | /// special-purpose shuffle. | |||
| 40952 | /// 2) Look for sequences of shuffle instructions with 3 or more total | |||
| 40953 | /// instructions, and replace them with the slightly more expensive SSSE3 | |||
| 40954 | /// PSHUFB instruction if available. We do this as the last combining step | |||
| 40955 | /// to ensure we avoid using PSHUFB if we can implement the shuffle with | |||
| 40956 | /// a suitable short sequence of other instructions. The PSHUFB will either | |||
| 40957 | /// use a register or have to read from memory and so is slightly (but only | |||
| 40958 | /// slightly) more expensive than the other shuffle instructions. | |||
| 40959 | /// | |||
| 40960 | /// Because this is inherently a quadratic operation (for each shuffle in | |||
| 40961 | /// a chain, we recurse up the chain), the depth is limited to 8 instructions. | |||
| 40962 | /// This should never be an issue in practice as the shuffle lowering doesn't | |||
| 40963 | /// produce sequences of more than 8 instructions. | |||
| 40964 | /// | |||
| 40965 | /// FIXME: We will currently miss some cases where the redundant shuffling | |||
| 40966 | /// would simplify under the threshold for PSHUFB formation because of | |||
| 40967 | /// combine-ordering. To fix this, we should do the redundant instruction | |||
| 40968 | /// combining in this recursive walk. | |||
| 40969 | static SDValue combineX86ShufflesRecursively( | |||
| 40970 | ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root, | |||
| 40971 | ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth, | |||
| 40972 | unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask, | |||
| 40973 | bool AllowVariablePerLaneMask, SelectionDAG &DAG, | |||
| 40974 | const X86Subtarget &Subtarget) { | |||
| 40975 | assert(!RootMask.empty() && | |||
| 40976 | (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) && | |||
| 40977 | "Illegal shuffle root mask"); | |||
| 40978 | MVT RootVT = Root.getSimpleValueType(); | |||
| 40979 | assert(RootVT.isVector() && "Shuffles operate on vector types!")(static_cast <bool> (RootVT.isVector() && "Shuffles operate on vector types!" ) ? void (0) : __assert_fail ("RootVT.isVector() && \"Shuffles operate on vector types!\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 40979, __extension__ __PRETTY_FUNCTION__)); | |||
| 40980 | unsigned RootSizeInBits = RootVT.getSizeInBits(); | |||
| 40981 | ||||
| 40982 | // Bound the depth of our recursive combine because this is ultimately | |||
| 40983 | // quadratic in nature. | |||
| 40984 | if (Depth >= MaxDepth) | |||
| 40985 | return SDValue(); | |||
| 40986 | ||||
| 40987 | // Directly rip through bitcasts to find the underlying operand. | |||
| 40988 | SDValue Op = SrcOps[SrcOpIndex]; | |||
| 40989 | Op = peekThroughOneUseBitcasts(Op); | |||
| 40990 | ||||
| 40991 | EVT VT = Op.getValueType(); | |||
| 40992 | if (!VT.isVector() || !VT.isSimple()) | |||
| 40993 | return SDValue(); // Bail if we hit a non-simple non-vector. | |||
| 40994 | ||||
| 40995 | // FIXME: Just bail on f16 for now. | |||
| 40996 | if (VT.getVectorElementType() == MVT::f16) | |||
| 40997 | return SDValue(); | |||
| 40998 | ||||
| 40999 | assert((RootSizeInBits % VT.getSizeInBits()) == 0 &&(static_cast <bool> ((RootSizeInBits % VT.getSizeInBits ()) == 0 && "Can only combine shuffles upto size of the root op." ) ? void (0) : __assert_fail ("(RootSizeInBits % VT.getSizeInBits()) == 0 && \"Can only combine shuffles upto size of the root op.\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 41000, __extension__ __PRETTY_FUNCTION__)) | |||
| 41000 | "Can only combine shuffles upto size of the root op.")(static_cast <bool> ((RootSizeInBits % VT.getSizeInBits ()) == 0 && "Can only combine shuffles upto size of the root op." ) ? void (0) : __assert_fail ("(RootSizeInBits % VT.getSizeInBits()) == 0 && \"Can only combine shuffles upto size of the root op.\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 41000, __extension__ __PRETTY_FUNCTION__)); | |||
| 41001 | ||||
| 41002 | // Create a demanded elts mask from the referenced elements of Op. | |||
| 41003 | APInt OpDemandedElts = APInt::getZero(RootMask.size()); | |||
| 41004 | for (int M : RootMask) { | |||
| 41005 | int BaseIdx = RootMask.size() * SrcOpIndex; | |||
| 41006 | if (isInRange(M, BaseIdx, BaseIdx + RootMask.size())) | |||
| 41007 | OpDemandedElts.setBit(M - BaseIdx); | |||
| 41008 | } | |||
| 41009 | if (RootSizeInBits != VT.getSizeInBits()) { | |||
| 41010 | // Op is smaller than Root - extract the demanded elts for the subvector. | |||
| 41011 | unsigned Scale = RootSizeInBits / VT.getSizeInBits(); | |||
| 41012 | unsigned NumOpMaskElts = RootMask.size() / Scale; | |||
| 41013 | assert((RootMask.size() % Scale) == 0 && "Root mask size mismatch")(static_cast <bool> ((RootMask.size() % Scale) == 0 && "Root mask size mismatch") ? void (0) : __assert_fail ("(RootMask.size() % Scale) == 0 && \"Root mask size mismatch\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 41013, __extension__ __PRETTY_FUNCTION__)); | |||
| 41014 | assert(OpDemandedElts(static_cast <bool> (OpDemandedElts .extractBits(RootMask .size() - NumOpMaskElts, NumOpMaskElts) .isZero() && "Out of range elements referenced in root mask" ) ? void (0) : __assert_fail ("OpDemandedElts .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts) .isZero() && \"Out of range elements referenced in root mask\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 41017, __extension__ __PRETTY_FUNCTION__)) | |||
| 41015 | .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts)(static_cast <bool> (OpDemandedElts .extractBits(RootMask .size() - NumOpMaskElts, NumOpMaskElts) .isZero() && "Out of range elements referenced in root mask" ) ? void (0) : __assert_fail ("OpDemandedElts .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts) .isZero() && \"Out of range elements referenced in root mask\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 41017, __extension__ __PRETTY_FUNCTION__)) | |||
| 41016 | .isZero() &&(static_cast <bool> (OpDemandedElts .extractBits(RootMask .size() - NumOpMaskElts, NumOpMaskElts) .isZero() && "Out of range elements referenced in root mask" ) ? void (0) : __assert_fail ("OpDemandedElts .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts) .isZero() && \"Out of range elements referenced in root mask\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 41017, __extension__ __PRETTY_FUNCTION__)) | |||
| 41017 | "Out of range elements referenced in root mask")(static_cast <bool> (OpDemandedElts .extractBits(RootMask .size() - NumOpMaskElts, NumOpMaskElts) .isZero() && "Out of range elements referenced in root mask" ) ? void (0) : __assert_fail ("OpDemandedElts .extractBits(RootMask.size() - NumOpMaskElts, NumOpMaskElts) .isZero() && \"Out of range elements referenced in root mask\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 41017, __extension__ __PRETTY_FUNCTION__)); | |||
| 41018 | OpDemandedElts = OpDemandedElts.extractBits(NumOpMaskElts, 0); | |||
| 41019 | } | |||
| 41020 | OpDemandedElts = | |||
| 41021 | APIntOps::ScaleBitMask(OpDemandedElts, VT.getVectorNumElements()); | |||
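  // ScaleBitMask rescales the demanded-elts mask between granularities: e.g.
  // scaling the 4-bit mask 0b0101 up to 8 elements gives 0b00110011, while
  // scaling down ORs adjacent bits together.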

  // Extract target shuffle mask and resolve sentinels and inputs.
  SmallVector<int, 64> OpMask;
  SmallVector<SDValue, 2> OpInputs;
  APInt OpUndef, OpZero;
  bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode());
  if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef,
                             OpZero, DAG, Depth, false)) {
    // Shuffle inputs must not be larger than the shuffle result.
    // TODO: Relax this for single input faux shuffles (e.g. trunc).
    if (llvm::any_of(OpInputs, [VT](SDValue OpInput) {
          return OpInput.getValueSizeInBits() > VT.getSizeInBits();
        }))
      return SDValue();
  } else if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
             (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
             !isNullConstant(Op.getOperand(1))) {
    SDValue SrcVec = Op.getOperand(0);
    int ExtractIdx = Op.getConstantOperandVal(1);
    unsigned NumElts = VT.getVectorNumElements();
    OpInputs.assign({SrcVec});
    OpMask.assign(NumElts, SM_SentinelUndef);
    std::iota(OpMask.begin(), OpMask.end(), ExtractIdx);
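    // The extract is modelled as an identity shuffle offset by the extract
    // index, e.g. extracting a v4i32 at element index 4 from a v8i32 source
    // yields the mask {4, 5, 6, 7}.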
    OpZero = OpUndef = APInt::getZero(NumElts);
  } else {
    return SDValue();
  }

  // If the shuffle result was smaller than the root, we need to adjust the
  // mask indices and pad the mask with undefs.
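  // E.g. for a 2-input 128-bit shuffle under a 256-bit root (NumSubVecs = 2,
  // OpMaskSize = 4), the padded mask size is 8, so mask entry M = 5 (input 1,
  // element 1) is remapped to 8 * 1 + 1 = 9 before the mask is padded to 8
  // entries with SM_SentinelUndef.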
  if (RootSizeInBits > VT.getSizeInBits()) {
    unsigned NumSubVecs = RootSizeInBits / VT.getSizeInBits();
    unsigned OpMaskSize = OpMask.size();
    if (OpInputs.size() > 1) {
      unsigned PaddedMaskSize = NumSubVecs * OpMaskSize;
      for (int &M : OpMask) {
        if (M < 0)
          continue;
        int EltIdx = M % OpMaskSize;
        int OpIdx = M / OpMaskSize;
        M = (PaddedMaskSize * OpIdx) + EltIdx;
      }
    }
    OpZero = OpZero.zext(NumSubVecs * OpMaskSize);
    OpUndef = OpUndef.zext(NumSubVecs * OpMaskSize);
    OpMask.append((NumSubVecs - 1) * OpMaskSize, SM_SentinelUndef);
  }

  SmallVector<int, 64> Mask;
  SmallVector<SDValue, 16> Ops;

  // We don't need to merge masks if the root is empty.
  bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
  if (EmptyRoot) {
    // Only resolve zeros if it will remove an input, otherwise we might end
    // up in an infinite loop.
    bool ResolveKnownZeros = true;
    if (!OpZero.isZero()) {
      APInt UsedInputs = APInt::getZero(OpInputs.size());
      for (int i = 0, e = OpMask.size(); i != e; ++i) {
        int M = OpMask[i];
        if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
          continue;
        UsedInputs.setBit(M / OpMask.size());
        if (UsedInputs.isAllOnes()) {
          ResolveKnownZeros = false;
          break;
        }
      }
    }
    resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
                                      ResolveKnownZeros);

    Mask = OpMask;
    Ops.append(OpInputs.begin(), OpInputs.end());
  } else {
    resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);

    // Add the inputs to the Ops list, avoiding duplicates.
    Ops.append(SrcOps.begin(), SrcOps.end());

    auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
      // Attempt to find an existing match.
      SDValue InputBC = peekThroughBitcasts(Input);
      for (int i = 0, e = Ops.size(); i < e; ++i)
        if (InputBC == peekThroughBitcasts(Ops[i]))
          return i;
      // Match failed - should we replace an existing Op?
      if (InsertionPoint >= 0) {
        Ops[InsertionPoint] = Input;
        return InsertionPoint;
      }
      // Add to the end of the Ops list.
      Ops.push_back(Input);
      return Ops.size() - 1;
    };

    SmallVector<int, 2> OpInputIdx;
    for (SDValue OpInput : OpInputs)
      OpInputIdx.push_back(
          AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));

    assert(((RootMask.size() > OpMask.size() &&
             RootMask.size() % OpMask.size() == 0) ||
            (OpMask.size() > RootMask.size() &&
             OpMask.size() % RootMask.size() == 0) ||
            OpMask.size() == RootMask.size()) &&
           "The smaller number of elements must divide the larger.");

    // This function can be performance-critical, so we rely on the power-of-2
    // knowledge that we have about the mask sizes to replace div/rem ops with
    // bit-masks and shifts.
    assert(llvm::has_single_bit<uint32_t>(RootMask.size()) &&
           "Non-power-of-2 shuffle mask sizes");
    assert(llvm::has_single_bit<uint32_t>(OpMask.size()) &&
           "Non-power-of-2 shuffle mask sizes");
    unsigned RootMaskSizeLog2 = llvm::countr_zero(RootMask.size());
    unsigned OpMaskSizeLog2 = llvm::countr_zero(OpMask.size());

    unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
    unsigned RootRatio =
        std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
    unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
    assert((RootRatio == 1 || OpRatio == 1) &&
           "Must not have a ratio for both incoming and op masks!");

    assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
    assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
    assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
    unsigned RootRatioLog2 = llvm::countr_zero(RootRatio);
    unsigned OpRatioLog2 = llvm::countr_zero(OpRatio);

    Mask.resize(MaskWidth, SM_SentinelUndef);

    // Merge this shuffle operation's mask into our accumulated mask. Note that
    // this shuffle's mask will be the first applied to the input, followed by
    // the root mask to get us all the way to the root value arrangement. The
    // reason for this order is that we are recursing up the operation chain.
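    // E.g. merging a v4 root mask over a v8 op mask gives MaskWidth = 8,
    // RootRatio = 2 and OpRatio = 1: output lane i = 5 reads RootIdx = 2, and
    // if RootMask[2] == 1 the scaled index is (1 << 1) + (5 & 1) = 3, which
    // then indexes OpMask.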
    for (unsigned i = 0; i < MaskWidth; ++i) {
      unsigned RootIdx = i >> RootRatioLog2;
      if (RootMask[RootIdx] < 0) {
        // This is a zero or undef lane, we're done.
        Mask[i] = RootMask[RootIdx];
        continue;
      }

      unsigned RootMaskedIdx =
          RootRatio == 1
              ? RootMask[RootIdx]
              : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));

      // Just insert the scaled root mask value if it references an input other
      // than the SrcOp we're currently inserting.
      if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
          (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
        Mask[i] = RootMaskedIdx;
        continue;
      }

      RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
      unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
      if (OpMask[OpIdx] < 0) {
        // The incoming lanes are zero or undef, it doesn't matter which ones we
        // are using.
        Mask[i] = OpMask[OpIdx];
        continue;
      }

      // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
      unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
                                          : (OpMask[OpIdx] << OpRatioLog2) +
                                                (RootMaskedIdx & (OpRatio - 1));

      OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
      int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
      assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
      OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;

      Mask[i] = OpMaskedIdx;
    }
  }

  // Peek through vector widenings and set out of bounds mask indices to undef.
  // TODO: Can resolveTargetShuffleInputsAndMask do some of this?
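  // E.g. an op widened via insert_subvector(undef, X, 0) only provides data in
  // its low lanes, so any merged mask index pointing at the undef upper lanes
  // can be replaced with SM_SentinelUndef below.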
  for (unsigned I = 0, E = Ops.size(); I != E; ++I) {
    SDValue &Op = Ops[I];
    if (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op.getOperand(0).isUndef() &&
        isNullConstant(Op.getOperand(2))) {
      Op = Op.getOperand(1);
      unsigned Scale = RootSizeInBits / Op.getValueSizeInBits();
      int Lo = I * Mask.size();
      int Hi = (I + 1) * Mask.size();
      int NewHi = Lo + (Mask.size() / Scale);
      for (int &M : Mask) {
        if (Lo <= M && NewHi <= M && M < Hi)
          M = SM_SentinelUndef;
      }
    }
  }

  // Peek through any free extract_subvector nodes back to root size.
  for (SDValue &Op : Ops)
    while (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
           (RootSizeInBits % Op.getOperand(0).getValueSizeInBits()) == 0 &&
           isNullConstant(Op.getOperand(1)))
      Op = Op.getOperand(0);

  // Remove unused/repeated shuffle source ops.
  resolveTargetShuffleInputsAndMask(Ops, Mask);

  // Handle the all undef/zero/ones cases early.
  if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
    return DAG.getUNDEF(RootVT);
  if (all_of(Mask, [](int Idx) { return Idx < 0; }))
    return getZeroVector(RootVT, Subtarget, DAG, SDLoc(Root));
  if (Ops.size() == 1 && ISD::isBuildVectorAllOnes(Ops[0].getNode()) &&
      !llvm::is_contained(Mask, SM_SentinelZero))
    return getOnesVector(RootVT, DAG, SDLoc(Root));

  assert(!Ops.empty() && "Shuffle with no inputs detected");
  HasVariableMask |= IsOpVariableMask;

  // Update the list of shuffle nodes that have been combined so far.
  SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
                                                SrcNodes.end());
  CombinedNodes.push_back(Op.getNode());

  // See if we can recurse into each shuffle source op (if it's a target
  // shuffle). The source op should generally only be combined if it either has
  // a single use (i.e. the current Op) or all its users have already been
  // combined; if not, we can still combine but should prevent generation of
  // variable shuffles to avoid constant pool bloat.
  // Don't recurse if we already have more source ops than we can combine in
  // the remaining recursion depth.
  if (Ops.size() < (MaxDepth - Depth)) {
    for (int i = 0, e = Ops.size(); i < e; ++i) {
      // For empty roots, we need to resolve zeroable elements before combining
      // them with other shuffles.
      SmallVector<int, 64> ResolvedMask = Mask;
      if (EmptyRoot)
        resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
      bool AllowCrossLaneVar = false;
      bool AllowPerLaneVar = false;
      if (Ops[i].getNode()->hasOneUse() ||
          SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) {
        AllowCrossLaneVar = AllowVariableCrossLaneMask;
        AllowPerLaneVar = AllowVariablePerLaneMask;
      }
      if (SDValue Res = combineX86ShufflesRecursively(
              Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth,
              HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG,
              Subtarget))
        return Res;
    }
  }

  // Attempt to constant fold all of the constant source ops.
  if (SDValue Cst = combineX86ShufflesConstants(
          Ops, Mask, Root, HasVariableMask, DAG, Subtarget))
    return Cst;

  // If constant fold failed and we only have constants - then we have
  // multiple uses by a single non-variable shuffle - just bail.
  if (Depth == 0 && llvm::all_of(Ops, [&](SDValue Op) {
        APInt UndefElts;
        SmallVector<APInt> RawBits;
        unsigned EltSizeInBits = RootSizeInBits / Mask.size();
        return getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts,
                                             RawBits);
      })) {
    return SDValue();
  }

  // Canonicalize the combined shuffle mask chain with horizontal ops.
  // NOTE: This will update the Ops and Mask.
  if (SDValue HOp = canonicalizeShuffleMaskWithHorizOp(
          Ops, Mask, RootSizeInBits, SDLoc(Root), DAG, Subtarget))
    return DAG.getBitcast(RootVT, HOp);

  // Try to refine our inputs given our knowledge of target shuffle mask.
  for (auto I : enumerate(Ops)) {
    int OpIdx = I.index();
    SDValue &Op = I.value();

    // What range of shuffle mask element values results in picking from Op?
    int Lo = OpIdx * Mask.size();
    int Hi = Lo + Mask.size();

    // Which elements of Op do we demand, given the mask's granularity?
    APInt OpDemandedElts(Mask.size(), 0);
    for (int MaskElt : Mask) {
      if (isInRange(MaskElt, Lo, Hi)) { // Picks from Op?
        int OpEltIdx = MaskElt - Lo;
        OpDemandedElts.setBit(OpEltIdx);
      }
    }

    // Is the shuffle result smaller than the root?
    if (Op.getValueSizeInBits() < RootSizeInBits) {
      // We padded the mask with undefs. But we now need to undo that.
      unsigned NumExpectedVectorElts = Mask.size();
      unsigned EltSizeInBits = RootSizeInBits / NumExpectedVectorElts;
      unsigned NumOpVectorElts = Op.getValueSizeInBits() / EltSizeInBits;
      assert(!OpDemandedElts.extractBits(
                 NumExpectedVectorElts - NumOpVectorElts, NumOpVectorElts) &&
             "Demanding the virtual undef widening padding?");
      OpDemandedElts = OpDemandedElts.trunc(NumOpVectorElts); // NUW
    }

    // The Op itself may be of different VT, so we need to scale the mask.
    unsigned NumOpElts = Op.getValueType().getVectorNumElements();
    APInt OpScaledDemandedElts =
        APIntOps::ScaleBitMask(OpDemandedElts, NumOpElts);

    // Can this operand be simplified any further, given its demanded elements?
    if (SDValue NewOp =
            DAG.getTargetLoweringInfo().SimplifyMultipleUseDemandedVectorElts(
                Op, OpScaledDemandedElts, DAG))
      Op = NewOp;
  }
  // FIXME: should we rerun resolveTargetShuffleInputsAndMask() now?

  // Widen any subvector shuffle inputs we've collected.
  // TODO: Remove this to avoid generating temporary nodes, we should only
  // widen once combineX86ShuffleChain has found a match.
  if (any_of(Ops, [RootSizeInBits](SDValue Op) {
        return Op.getValueSizeInBits() < RootSizeInBits;
      })) {
    for (SDValue &Op : Ops)
      if (Op.getValueSizeInBits() < RootSizeInBits)
        Op = widenSubVector(Op, false, Subtarget, DAG, SDLoc(Op),
                            RootSizeInBits);
    // Reresolve - we might have repeated subvector sources.
    resolveTargetShuffleInputsAndMask(Ops, Mask);
  }

  // We can only combine unary and binary shuffle mask cases.
  if (Ops.size() <= 2) {
    // Minor canonicalization of the accumulated shuffle mask to make it easier
    // to match below. All this does is detect masks with sequential pairs of
    // elements, and shrink them to the half-width mask. It does this in a loop
    // so it will reduce the size of the mask to the minimal width mask which
    // performs an equivalent shuffle.
    while (Mask.size() > 1) {
      SmallVector<int, 64> WidenedMask;
      if (!canWidenShuffleElements(Mask, WidenedMask))
        break;
      Mask = std::move(WidenedMask);
    }

    // Canonicalization of binary shuffle masks to improve pattern matching by
    // commuting the inputs.
    if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
      ShuffleVectorSDNode::commuteMask(Mask);
      std::swap(Ops[0], Ops[1]);
    }

    // Try to combine into a single shuffle instruction.
    if (SDValue Shuffle = combineX86ShuffleChain(
            Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
            AllowVariablePerLaneMask, DAG, Subtarget))
      return Shuffle;

    // If all the operands come from the same larger vector, fallthrough and try
    // to use combineX86ShuffleChainWithExtract.
    SDValue LHS = peekThroughBitcasts(Ops.front());
    SDValue RHS = peekThroughBitcasts(Ops.back());
    if (Ops.size() != 2 || !Subtarget.hasAVX2() || RootSizeInBits != 128 ||
        (RootSizeInBits / Mask.size()) != 64 ||
        LHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
        RHS.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
        LHS.getOperand(0) != RHS.getOperand(0))
      return SDValue();
  }

  // If that failed and any input is extracted then try to combine as a
  // shuffle with the larger type.
  return combineX86ShuffleChainWithExtract(
      Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask,
      AllowVariablePerLaneMask, DAG, Subtarget);
}
/// Helper entry wrapper to combineX86ShufflesRecursively.
static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG,
                                             const X86Subtarget &Subtarget) {
  return combineX86ShufflesRecursively(
      {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth,
      /*HasVarMask*/ false,
      /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG,
      Subtarget);
}

/// Get the PSHUF-style mask from PSHUF node.
///
/// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
/// PSHUF-style masks that can be reused with such instructions.
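///
/// E.g. for a PSHUFHW node the full mask {0, 1, 2, 3, 7, 6, 5, 4} is trimmed
/// to the high-half-relative mask {3, 2, 1, 0}.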
static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
  MVT VT = N.getSimpleValueType();
  SmallVector<int, 4> Mask;
  SmallVector<SDValue, 2> Ops;
  bool HaveMask =
      getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask);
  (void)HaveMask;
  assert(HaveMask);

  // If we have more than 128-bits, only the low 128-bits of shuffle mask
  // matter. Check that the upper masks are repeats and remove them.
  if (VT.getSizeInBits() > 128) {
    int LaneElts = 128 / VT.getScalarSizeInBits();
#ifndef NDEBUG
    for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
      for (int j = 0; j < LaneElts; ++j)
        assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
               "Mask doesn't repeat in high 128-bit lanes!");
#endif
    Mask.resize(LaneElts);
  }

  switch (N.getOpcode()) {
  case X86ISD::PSHUFD:
    return Mask;
  case X86ISD::PSHUFLW:
    Mask.resize(4);
    return Mask;
  case X86ISD::PSHUFHW:
    Mask.erase(Mask.begin(), Mask.begin() + 4);
    for (int &M : Mask)
      M -= 4;
    return Mask;
  default:
    llvm_unreachable("No valid shuffle instruction found!");
  }
}

/// Search for a combinable shuffle across a chain ending in pshufd.
///
/// We walk up the chain and look for a combinable shuffle, skipping over
/// shuffles that we could hoist this shuffle's transformation past without
/// altering anything.
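///
/// E.g. a PSHUFD with mask {1, 0, 2, 3} feeding from a PSHUFD with mask
/// {2, 3, 0, 1} composes via M = VMask[M] into the single mask {3, 2, 0, 1}.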
static SDValue
combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
                             SelectionDAG &DAG) {
  assert(N.getOpcode() == X86ISD::PSHUFD &&
         "Called with something other than an x86 128-bit half shuffle!");
  SDLoc DL(N);

  // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
  // of the shuffles in the chain so that we can form a fresh chain to replace
  // this one.
  SmallVector<SDValue, 8> Chain;
  SDValue V = N.getOperand(0);
  for (; V.hasOneUse(); V = V.getOperand(0)) {
    switch (V.getOpcode()) {
    default:
      return SDValue(); // Nothing combined!

    case ISD::BITCAST:
      // Skip bitcasts as we always know the type for the target specific
      // instructions.
      continue;

    case X86ISD::PSHUFD:
      // Found another dword shuffle.
      break;

    case X86ISD::PSHUFLW:
      // Check that the low words (being shuffled) are the identity in the
      // dword shuffle, and the high words are self-contained.
      if (Mask[0] != 0 || Mask[1] != 1 ||
          !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::PSHUFHW:
      // Check that the high words (being shuffled) are the identity in the
      // dword shuffle, and the low words are self-contained.
      if (Mask[2] != 2 || Mask[3] != 3 ||
          !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
        return SDValue();

      Chain.push_back(V);
      continue;

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
      // shuffle into a preceding word shuffle.
      if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
          V.getSimpleValueType().getVectorElementType() != MVT::i16)
        return SDValue();

      // Search for a half-shuffle which we can combine with.
      unsigned CombineOp =
          V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
      if (V.getOperand(0) != V.getOperand(1) ||
          !V->isOnlyUserOf(V.getOperand(0).getNode()))
        return SDValue();
      Chain.push_back(V);
      V = V.getOperand(0);
      do {
        switch (V.getOpcode()) {
        default:
          return SDValue(); // Nothing to combine.

        case X86ISD::PSHUFLW:
        case X86ISD::PSHUFHW:
          if (V.getOpcode() == CombineOp)
            break;

          Chain.push_back(V);

          [[fallthrough]];
        case ISD::BITCAST:
          V = V.getOperand(0);
          continue;
        }
        break;
      } while (V.hasOneUse());
      break;
    }
    // Break out of the loop if we break out of the switch.
    break;
  }

  if (!V.hasOneUse())
    // We fell out of the loop without finding a viable combining instruction.
    return SDValue();

  // Merge this node's mask and our incoming mask.
  SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
  for (int &M : Mask)
    M = VMask[M];
  V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
                  getV4X86ShuffleImm8ForMask(Mask, DL, DAG));

  // Rebuild the chain around this new shuffle.
  while (!Chain.empty()) {
    SDValue W = Chain.pop_back_val();

    if (V.getValueType() != W.getOperand(0).getValueType())
      V = DAG.getBitcast(W.getOperand(0).getValueType(), V);

    switch (W.getOpcode()) {
    default:
      llvm_unreachable("Only PSHUF and UNPCK instructions get here!");

    case X86ISD::UNPCKL:
    case X86ISD::UNPCKH:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
      break;

    case X86ISD::PSHUFD:
    case X86ISD::PSHUFLW:
    case X86ISD::PSHUFHW:
      V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
      break;
    }
  }
  if (V.getValueType() != N.getValueType())
    V = DAG.getBitcast(N.getValueType(), V);

  // Return the new chain to replace N.
  return V;
}

// Attempt to commute shufps LHS loads:
// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
                                      SelectionDAG &DAG) {
  // TODO: Add vXf64 support.
  if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
    return SDValue();

  // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
  auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
    if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
      return SDValue();
    SDValue N0 = V.getOperand(0);
    SDValue N1 = V.getOperand(1);
    unsigned Imm = V.getConstantOperandVal(2);
    const X86Subtarget &Subtarget = DAG.getSubtarget<X86Subtarget>();
    if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
        X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
      return SDValue();
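    // SHUFP takes its low two result elements from the first operand and the
    // high two from the second, so commuting the operands also means swapping
    // the two 4-bit halves of the immediate: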
    Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
    return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
                       DAG.getTargetConstant(Imm, DL, MVT::i8));
  };

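  // When an inner SHUFP is commuted, its low and high result pairs swap, so
  // each 2-bit immediate field of the outer shuffle that indexes into it must
  // flip its high bit: XOR with 0xAA adjusts all four fields, 0x0A only the
  // two low fields (first operand), and 0xA0 only the two high fields (second
  // operand).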
  switch (N.getOpcode()) {
  case X86ISD::VPERMILPI:
    if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
      unsigned Imm = N.getConstantOperandVal(1);
      return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
                         DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
    }
    break;
  case X86ISD::SHUFP: {
    SDValue N0 = N.getOperand(0);
    SDValue N1 = N.getOperand(1);
    unsigned Imm = N.getConstantOperandVal(2);
    if (N0 == N1) {
      if (SDValue NewSHUFP = commuteSHUFP(N, N0))
        return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
                           DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
    } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
      return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
                         DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
    } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
      return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
                         DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
    }
    break;
  }
  }

  return SDValue();
}

// Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)).
static SDValue canonicalizeShuffleWithBinOps(SDValue N, SelectionDAG &DAG,
                                             const SDLoc &DL) {
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  EVT ShuffleVT = N.getValueType();

  auto IsMergeableWithShuffle = [&DAG](SDValue Op, bool FoldLoad = false) {
    // AllZeros/AllOnes constants are freely shuffled and will peek through
    // bitcasts. Other constant build vectors do not peek through bitcasts.
    // Only merge with target shuffles if they have one use so shuffle
    // combining is likely to kick in. Shuffles of splats are expected to be
    // removed.
    return ISD::isBuildVectorAllOnes(Op.getNode()) ||
           ISD::isBuildVectorAllZeros(Op.getNode()) ||
           ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
           ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) ||
           (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) ||
           (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) ||
           (FoldLoad && isShuffleFoldableLoad(Op)) ||
           DAG.isSplatValue(Op, /*AllowUndefs*/ false);
  };
  auto IsSafeToMoveShuffle = [ShuffleVT](SDValue Op, unsigned BinOp) {
    // Ensure we only shuffle whole vector src elements, unless it's a logical
    // binop where we can more aggressively move shuffles from dst to src.
    return BinOp == ISD::AND || BinOp == ISD::OR || BinOp == ISD::XOR ||
           BinOp == X86ISD::ANDNP ||
           (Op.getScalarValueSizeInBits() <= ShuffleVT.getScalarSizeInBits());
  };

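  // E.g. pshufd(xor(x, c)) -> xor(pshufd(x), pshufd(c)): the shuffle of the
  // constant folds away, and the remaining shuffle may then combine with
  // whatever defines x.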
  unsigned Opc = N.getOpcode();
  switch (Opc) {
  // Unary and Unary+Permute Shuffles.
  case X86ISD::PSHUFB: {
    // Don't merge PSHUFB if it contains zero'd elements.
    SmallVector<int> Mask;
    SmallVector<SDValue> Ops;
    if (!getTargetShuffleMask(N.getNode(), ShuffleVT.getSimpleVT(), false, Ops,
                              Mask))
      break;
    [[fallthrough]];
  }
  case X86ISD::VBROADCAST:
  case X86ISD::MOVDDUP:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::VPERMI:
  case X86ISD::VPERMILPI: {
    if (N.getOperand(0).getValueType() == ShuffleVT &&
        N->isOnlyUserOf(N.getOperand(0).getNode())) {
      SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
      unsigned SrcOpcode = N0.getOpcode();
      if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) {
        SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
        SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
        if (IsMergeableWithShuffle(Op00, Opc != X86ISD::PSHUFB) ||
            IsMergeableWithShuffle(Op01, Opc != X86ISD::PSHUFB)) {
          SDValue LHS, RHS;
          Op00 = DAG.getBitcast(ShuffleVT, Op00);
          Op01 = DAG.getBitcast(ShuffleVT, Op01);
          if (N.getNumOperands() == 2) {
            LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1));
            RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1));
          } else {
            LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00);
            RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01);
          }
          EVT OpVT = N0.getValueType();
          return DAG.getBitcast(ShuffleVT,
                                DAG.getNode(SrcOpcode, DL, OpVT,
                                            DAG.getBitcast(OpVT, LHS),
                                            DAG.getBitcast(OpVT, RHS)));
        }
      }
    }
    break;
  }
  // Binary and Binary+Permute Shuffles.
  case X86ISD::INSERTPS: {
    // Don't merge INSERTPS if it contains zero'd elements.
    unsigned InsertPSMask = N.getConstantOperandVal(2);
    unsigned ZeroMask = InsertPSMask & 0xF;
    if (ZeroMask != 0)
      break;
    [[fallthrough]];
  }
  case X86ISD::MOVSD:
  case X86ISD::MOVSS:
  case X86ISD::BLENDI:
  case X86ISD::SHUFP:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL: {
    if (N->isOnlyUserOf(N.getOperand(0).getNode()) &&
        N->isOnlyUserOf(N.getOperand(1).getNode())) {
      SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0));
      SDValue N1 = peekThroughOneUseBitcasts(N.getOperand(1));
      unsigned SrcOpcode = N0.getOpcode();
      if (TLI.isBinOp(SrcOpcode) && N1.getOpcode() == SrcOpcode &&
          IsSafeToMoveShuffle(N0, SrcOpcode) &&
          IsSafeToMoveShuffle(N1, SrcOpcode)) {
        SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0));
        SDValue Op10 = peekThroughOneUseBitcasts(N1.getOperand(0));
        SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1));
        SDValue Op11 = peekThroughOneUseBitcasts(N1.getOperand(1));
        // Ensure the total number of shuffles doesn't increase by folding this
        // shuffle through to the source ops.
        if (((IsMergeableWithShuffle(Op00) && IsMergeableWithShuffle(Op10)) ||
             (IsMergeableWithShuffle(Op01) && IsMergeableWithShuffle(Op11))) ||
            ((IsMergeableWithShuffle(Op00) || IsMergeableWithShuffle(Op10)) &&
             (IsMergeableWithShuffle(Op01) || IsMergeableWithShuffle(Op11)))) {
          SDValue LHS, RHS;
          Op00 = DAG.getBitcast(ShuffleVT, Op00);
          Op10 = DAG.getBitcast(ShuffleVT, Op10);
          Op01 = DAG.getBitcast(ShuffleVT, Op01);
          Op11 = DAG.getBitcast(ShuffleVT, Op11);
          if (N.getNumOperands() == 3) {
            LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10, N.getOperand(2));
            RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11, N.getOperand(2));
          } else {
            LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, Op10);
            RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, Op11);
          }
          EVT OpVT = N0.getValueType();
          return DAG.getBitcast(ShuffleVT,
                                DAG.getNode(SrcOpcode, DL, OpVT,
                                            DAG.getBitcast(OpVT, LHS),
                                            DAG.getBitcast(OpVT, RHS)));
        }
      }
    }
    break;
  }
  }
  return SDValue();
}

/// Attempt to fold vpermf128(op(),op()) -> op(vpermf128(),vpermf128()).
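///
/// E.g. if both sources are VPERMILPI nodes with the same immediate, the lane
/// shuffle is performed on their sources first and a single VPERMILPI is
/// applied to the result.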
| 41778 | static SDValue canonicalizeLaneShuffleWithRepeatedOps(SDValue V, | |||
| 41779 | SelectionDAG &DAG, | |||
| 41780 | const SDLoc &DL) { | |||
| 41781 | assert(V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle")(static_cast <bool> (V.getOpcode() == X86ISD::VPERM2X128 && "Unknown lane shuffle") ? void (0) : __assert_fail ("V.getOpcode() == X86ISD::VPERM2X128 && \"Unknown lane shuffle\"" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 41781, __extension__ __PRETTY_FUNCTION__)); | |||
| 41782 | ||||
| 41783 | MVT VT = V.getSimpleValueType(); | |||
| 41784 | SDValue Src0 = peekThroughBitcasts(V.getOperand(0)); | |||
| 41785 | SDValue Src1 = peekThroughBitcasts(V.getOperand(1)); | |||
| 41786 | unsigned SrcOpc0 = Src0.getOpcode(); | |||
| 41787 | unsigned SrcOpc1 = Src1.getOpcode(); | |||
| 41788 | EVT SrcVT0 = Src0.getValueType(); | |||
| 41789 | EVT SrcVT1 = Src1.getValueType(); | |||
| 41790 | ||||
| 41791 | if (!Src1.isUndef() && (SrcVT0 != SrcVT1 || SrcOpc0 != SrcOpc1)) | |||
| 41792 | return SDValue(); | |||
| 41793 | ||||
| 41794 | switch (SrcOpc0) { | |||
| 41795 | case X86ISD::MOVDDUP: { | |||
| 41796 | SDValue LHS = Src0.getOperand(0); | |||
| 41797 | SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0); | |||
| 41798 | SDValue Res = | |||
| 41799 | DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, V.getOperand(2)); | |||
| 41800 | Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res); | |||
| 41801 | return DAG.getBitcast(VT, Res); | |||
| 41802 | } | |||
| 41803 | case X86ISD::VPERMILPI: | |||
| 41804 | // TODO: Handle v4f64 permutes with different low/high lane masks. | |||
| 41805 | if (SrcVT0 == MVT::v4f64) { | |||
| 41806 | uint64_t Mask = Src0.getConstantOperandVal(1); | |||
| 41807 | if ((Mask & 0x3) != ((Mask >> 2) & 0x3)) | |||
| 41808 | break; | |||
| 41809 | } | |||
| 41810 | [[fallthrough]]; | |||
| 41811 | case X86ISD::VSHLI: | |||
| 41812 | case X86ISD::VSRLI: | |||
| 41813 | case X86ISD::VSRAI: | |||
| 41814 | case X86ISD::PSHUFD: | |||
| 41815 | if (Src1.isUndef() || Src0.getOperand(1) == Src1.getOperand(1)) { | |||
| 41816 | SDValue LHS = Src0.getOperand(0); | |||
| 41817 | SDValue RHS = Src1.isUndef() ? Src1 : Src1.getOperand(0); | |||
| 41818 | SDValue Res = DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT0, LHS, RHS, | |||
| 41819 | V.getOperand(2)); | |||
| 41820 | Res = DAG.getNode(SrcOpc0, DL, SrcVT0, Res, Src0.getOperand(1)); | |||
| 41821 | return DAG.getBitcast(VT, Res); | |||
| 41822 | } | |||
| 41823 | break; | |||
| 41824 | } | |||
| 41825 | ||||
| 41826 | return SDValue(); | |||
| 41827 | } | |||
| 41828 | ||||
| 41829 | /// Try to combine x86 target specific shuffles. | |||
| 41830 | static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, | |||
| 41831 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 41832 | const X86Subtarget &Subtarget) { | |||
| 41833 | SDLoc DL(N); | |||
| 41834 | MVT VT = N.getSimpleValueType(); | |||
| 41835 | SmallVector<int, 4> Mask; | |||
| 41836 | unsigned Opcode = N.getOpcode(); | |||
| 41837 | ||||
| 41838 | if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG)) | |||
| 41839 | return R; | |||
| 41840 | ||||
| 41841 | // Handle specific target shuffles. | |||
| 41842 | switch (Opcode) { | |||
| 41843 | case X86ISD::MOVDDUP: { | |||
| 41844 | SDValue Src = N.getOperand(0); | |||
| 41845 | // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload. | |||
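| | // MOVDDUP only reads the low f64 element, so the full 16-byte load can be | |||
| | // narrowed to an 8-byte zero-extending load, e.g. (illustrative): | |||
| | // (v2f64 movddup (load p)) --> (movddup (vzload64 p)) | |||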
| 41846 | if (VT == MVT::v2f64 && Src.hasOneUse() && | |||
| 41847 | ISD::isNormalLoad(Src.getNode())) { | |||
| 41848 | LoadSDNode *LN = cast<LoadSDNode>(Src); | |||
| 41849 | if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) { | |||
| 41850 | SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad); | |||
| 41851 | DCI.CombineTo(N.getNode(), Movddup); | |||
| 41852 | DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); | |||
| 41853 | DCI.recursivelyDeleteUnusedNodes(LN); | |||
| 41854 | return N; // Return N so it doesn't get rechecked! | |||
| 41855 | } | |||
| 41856 | } | |||
| 41857 | ||||
| 41858 | return SDValue(); | |||
| 41859 | } | |||
| 41860 | case X86ISD::VBROADCAST: { | |||
| 41861 | SDValue Src = N.getOperand(0); | |||
| 41862 | SDValue BC = peekThroughBitcasts(Src); | |||
| 41863 | EVT SrcVT = Src.getValueType(); | |||
| 41864 | EVT BCVT = BC.getValueType(); | |||
| 41865 | ||||
| 41866 | // If broadcasting from another shuffle, attempt to simplify it. | |||
| 41867 | // TODO - we really need a general SimplifyDemandedVectorElts mechanism. | |||
| 41868 | if (isTargetShuffle(BC.getOpcode()) && | |||
| 41869 | VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits() == 0) { | |||
| 41870 | unsigned Scale = VT.getScalarSizeInBits() / BCVT.getScalarSizeInBits(); | |||
| 41871 | SmallVector<int, 16> DemandedMask(BCVT.getVectorNumElements(), | |||
| 41872 | SM_SentinelUndef); | |||
| 41873 | for (unsigned i = 0; i != Scale; ++i) | |||
| 41874 | DemandedMask[i] = i; | |||
| 41875 | if (SDValue Res = combineX86ShufflesRecursively( | |||
| 41876 | {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0, | |||
| 41877 | X86::MaxShuffleCombineDepth, | |||
| 41878 | /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true, | |||
| 41879 | /*AllowPerLaneVarMask*/ true, DAG, Subtarget)) | |||
| 41880 | return DAG.getNode(X86ISD::VBROADCAST, DL, VT, | |||
| 41881 | DAG.getBitcast(SrcVT, Res)); | |||
| 41882 | } | |||
| 41883 | ||||
| 41884 | // broadcast(bitcast(src)) -> bitcast(broadcast(src)) | |||
| 41885 | // 32-bit targets have to bitcast i64 to f64, so better to bitcast upward. | |||
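| | // e.g. (v4f64 vbroadcast (f64 bitcast (i64 x))) | |||
| | // --> (v4f64 bitcast (v4i64 vbroadcast x)) | |||
| | // which avoids having to move the i64 through an FP register first. | |||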
| 41886 | if (Src.getOpcode() == ISD::BITCAST && | |||
| 41887 | SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() && | |||
| 41888 | DAG.getTargetLoweringInfo().isTypeLegal(BCVT) && | |||
| 41889 | FixedVectorType::isValidElementType( | |||
| 41890 | BCVT.getScalarType().getTypeForEVT(*DAG.getContext()))) { | |||
| 41891 | EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(), | |||
| 41892 | VT.getVectorNumElements()); | |||
| 41893 | return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC)); | |||
| 41894 | } | |||
| 41895 | ||||
| 41896 | // vbroadcast(bitcast(vbroadcast(src))) -> bitcast(vbroadcast(src)) | |||
| 41897 | // If we're re-broadcasting a smaller type then broadcast with that type and | |||
| 41898 | // bitcast. | |||
| 41899 | // TODO: Do this for any splat? | |||
| 41900 | if (Src.getOpcode() == ISD::BITCAST && | |||
| 41901 | (BC.getOpcode() == X86ISD::VBROADCAST || | |||
| 41902 | BC.getOpcode() == X86ISD::VBROADCAST_LOAD) && | |||
| 41903 | (VT.getScalarSizeInBits() % BCVT.getScalarSizeInBits()) == 0 && | |||
| 41904 | (VT.getSizeInBits() % BCVT.getSizeInBits()) == 0) { | |||
| 41905 | MVT NewVT = | |||
| 41906 | MVT::getVectorVT(BCVT.getSimpleVT().getScalarType(), | |||
| 41907 | VT.getSizeInBits() / BCVT.getScalarSizeInBits()); | |||
| 41908 | return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC)); | |||
| 41909 | } | |||
| 41910 | ||||
| 41911 | // Reduce broadcast source vector to lowest 128-bits. | |||
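| | // A broadcast of a vector source only reads element 0, e.g. (illustrative): | |||
| | // (v8f32 vbroadcast (v8f32 x)) | |||
| | // --> (v8f32 vbroadcast (v4f32 extract_subvector x, 0)) | |||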
| 41912 | if (SrcVT.getSizeInBits() > 128) | |||
| 41913 | return DAG.getNode(X86ISD::VBROADCAST, DL, VT, | |||
| 41914 | extract128BitVector(Src, 0, DAG, DL)); | |||
| 41915 | ||||
| 41916 | // broadcast(scalar_to_vector(x)) -> broadcast(x). | |||
| 41917 | if (Src.getOpcode() == ISD::SCALAR_TO_VECTOR) | |||
| 41918 | return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0)); | |||
| 41919 | ||||
| 41920 | // broadcast(extract_vector_elt(x, 0)) -> broadcast(x). | |||
| 41921 | if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT && | |||
| 41922 | isNullConstant(Src.getOperand(1)) && | |||
| 41923 | DAG.getTargetLoweringInfo().isTypeLegal( | |||
| 41924 | Src.getOperand(0).getValueType())) | |||
| 41925 | return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Src.getOperand(0)); | |||
| 41926 | ||||
| 41927 | // Share broadcast with the longest vector and extract low subvector (free). | |||
| 41928 | // Ensure the same SDValue from the SDNode use is being used. | |||
| 41929 | for (SDNode *User : Src->uses()) | |||
| 41930 | if (User != N.getNode() && User->getOpcode() == X86ISD::VBROADCAST && | |||
| 41931 | Src == User->getOperand(0) && | |||
| 41932 | User->getValueSizeInBits(0).getFixedValue() > | |||
| 41933 | VT.getFixedSizeInBits()) { | |||
| 41934 | return extractSubVector(SDValue(User, 0), 0, DAG, DL, | |||
| 41935 | VT.getSizeInBits()); | |||
| 41936 | } | |||
| 41937 | ||||
| 41938 | // vbroadcast(scalarload X) -> vbroadcast_load X | |||
| 41939 | // For float loads, extract other uses of the scalar from the broadcast. | |||
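| | // e.g. (v4f32 vbroadcast (f32 load p)) --> (v4f32 vbroadcast_load p), with | |||
| | // any remaining scalar uses rewritten as an extract of lane 0 so only a | |||
| | // single load of p remains. | |||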
| 41940 | if (!SrcVT.isVector() && (Src.hasOneUse() || VT.isFloatingPoint()) && | |||
| 41941 | ISD::isNormalLoad(Src.getNode())) { | |||
| 41942 | LoadSDNode *LN = cast<LoadSDNode>(Src); | |||
| 41943 | SDVTList Tys = DAG.getVTList(VT, MVT::Other); | |||
| 41944 | SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; | |||
| 41945 | SDValue BcastLd = | |||
| 41946 | DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, | |||
| 41947 | LN->getMemoryVT(), LN->getMemOperand()); | |||
| 41948 | // If the load value is only used by N, replace it via CombineTo(N). | |||
| 41949 | bool NoReplaceExtract = Src.hasOneUse(); | |||
| 41950 | DCI.CombineTo(N.getNode(), BcastLd); | |||
| 41951 | if (NoReplaceExtract) { | |||
| 41952 | DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); | |||
| 41953 | DCI.recursivelyDeleteUnusedNodes(LN); | |||
| 41954 | } else { | |||
| 41955 | SDValue Scl = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT, BcastLd, | |||
| 41956 | DAG.getIntPtrConstant(0, DL)); | |||
| 41957 | DCI.CombineTo(LN, Scl, BcastLd.getValue(1)); | |||
| 41958 | } | |||
| 41959 | return N; // Return N so it doesn't get rechecked! | |||
| 41960 | } | |||
| 41961 | ||||
| 41962 | // Due to isTypeDesirableForOp, we won't always shrink a load truncated to | |||
| 41963 | // i16. So shrink it ourselves if we can make a broadcast_load. | |||
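| | // e.g. (vbroadcast (i16 trunc (i32 load p))) can become a 16-bit | |||
| | // broadcast load of p directly, since x86 is little-endian and the low | |||
| | // 16 bits live at the same address. | |||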
| 41964 | if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE && | |||
| 41965 | Src.hasOneUse() && Src.getOperand(0).hasOneUse()) { | |||
| 41966 | assert(Subtarget.hasAVX2() && "Expected AVX2"); | |||
| 41967 | SDValue TruncIn = Src.getOperand(0); | |||
| 41968 | ||||
| 41969 | // If this is a truncate of a non extending load we can just narrow it to | |||
| 41970 | // use a broadcast_load. | |||
| 41971 | if (ISD::isNormalLoad(TruncIn.getNode())) { | |||
| 41972 | LoadSDNode *LN = cast<LoadSDNode>(TruncIn); | |||
| 41973 | // Unless it's volatile or atomic. | |||
| 41974 | if (LN->isSimple()) { | |||
| 41975 | SDVTList Tys = DAG.getVTList(VT, MVT::Other); | |||
| 41976 | SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; | |||
| 41977 | SDValue BcastLd = DAG.getMemIntrinsicNode( | |||
| 41978 | X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16, | |||
| 41979 | LN->getPointerInfo(), LN->getOriginalAlign(), | |||
| 41980 | LN->getMemOperand()->getFlags()); | |||
| 41981 | DCI.CombineTo(N.getNode(), BcastLd); | |||
| 41982 | DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); | |||
| 41983 | DCI.recursivelyDeleteUnusedNodes(Src.getNode()); | |||
| 41984 | return N; // Return N so it doesn't get rechecked! | |||
| 41985 | } | |||
| 41986 | } | |||
| 41987 | ||||
| 41988 | // If this is a truncate of an i16 extload, we can directly replace it. | |||
| 41989 | if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) && | |||
| 41990 | ISD::isEXTLoad(Src.getOperand(0).getNode())) { | |||
| 41991 | LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0)); | |||
| 41992 | if (LN->getMemoryVT().getSizeInBits() == 16) { | |||
| 41993 | SDVTList Tys = DAG.getVTList(VT, MVT::Other); | |||
| 41994 | SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; | |||
| 41995 | SDValue BcastLd = | |||
| 41996 | DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, | |||
| 41997 | LN->getMemoryVT(), LN->getMemOperand()); | |||
| 41998 | DCI.CombineTo(N.getNode(), BcastLd); | |||
| 41999 | DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); | |||
| 42000 | DCI.recursivelyDeleteUnusedNodes(Src.getNode()); | |||
| 42001 | return N; // Return N so it doesn't get rechecked! | |||
| 42002 | } | |||
| 42003 | } | |||
| 42004 | ||||
| 42005 | // If this is a truncate of a load that has been shifted right, we can | |||
| 42006 | // offset the pointer and use a narrower load. | |||
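| | // e.g. for (i16 trunc (srl (i32 load p), 16)) the demanded bits start | |||
| | // ShiftAmt/8 == 2 bytes into the value, so we can broadcast-load an i16 | |||
| | // from p+2 instead (little-endian layout). | |||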
| 42007 | if (TruncIn.getOpcode() == ISD::SRL && | |||
| 42008 | TruncIn.getOperand(0).hasOneUse() && | |||
| 42009 | isa<ConstantSDNode>(TruncIn.getOperand(1)) && | |||
| 42010 | ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) { | |||
| 42011 | LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0)); | |||
| 42012 | unsigned ShiftAmt = TruncIn.getConstantOperandVal(1); | |||
| 42013 | // Make sure the shift amount and the load size are divisible by 16. | |||
| 42014 | // Don't do this if the load is volatile or atomic. | |||
| 42015 | if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 && | |||
| 42016 | LN->isSimple()) { | |||
| 42017 | unsigned Offset = ShiftAmt / 8; | |||
| 42018 | SDVTList Tys = DAG.getVTList(VT, MVT::Other); | |||
| 42019 | SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(), | |||
| 42020 | TypeSize::Fixed(Offset), DL); | |||
| 42021 | SDValue Ops[] = { LN->getChain(), Ptr }; | |||
| 42022 | SDValue BcastLd = DAG.getMemIntrinsicNode( | |||
| 42023 | X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16, | |||
| 42024 | LN->getPointerInfo().getWithOffset(Offset), | |||
| 42025 | LN->getOriginalAlign(), | |||
| 42026 | LN->getMemOperand()->getFlags()); | |||
| 42027 | DCI.CombineTo(N.getNode(), BcastLd); | |||
| 42028 | DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); | |||
| 42029 | DCI.recursivelyDeleteUnusedNodes(Src.getNode()); | |||
| 42030 | return N; // Return N so it doesn't get rechecked! | |||
| 42031 | } | |||
| 42032 | } | |||
| 42033 | } | |||
| 42034 | ||||
| 42035 | // vbroadcast(vzload X) -> vbroadcast_load X | |||
| 42036 | if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) { | |||
| 42037 | MemSDNode *LN = cast<MemIntrinsicSDNode>(Src); | |||
| 42038 | if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) { | |||
| 42039 | SDVTList Tys = DAG.getVTList(VT, MVT::Other); | |||
| 42040 | SDValue Ops[] = { LN->getChain(), LN->getBasePtr() }; | |||
| 42041 | SDValue BcastLd = | |||
| 42042 | DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, | |||
| 42043 | LN->getMemoryVT(), LN->getMemOperand()); | |||
| 42044 | DCI.CombineTo(N.getNode(), BcastLd); | |||
| 42045 | DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); | |||
| 42046 | DCI.recursivelyDeleteUnusedNodes(LN); | |||
| 42047 | return N; // Return N so it doesn't get rechecked! | |||
| 42048 | } | |||
| 42049 | } | |||
| 42050 | ||||
| 42051 | // vbroadcast(vector load X) -> vbroadcast_load | |||
| 42052 | if ((SrcVT == MVT::v2f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v2i64 || | |||
| 42053 | SrcVT == MVT::v4i32) && | |||
| 42054 | Src.hasOneUse() && ISD::isNormalLoad(Src.getNode())) { | |||
| 42055 | LoadSDNode *LN = cast<LoadSDNode>(Src); | |||
| 42056 | // Unless the load is volatile or atomic. | |||
| 42057 | if (LN->isSimple()) { | |||
| 42058 | SDVTList Tys = DAG.getVTList(VT, MVT::Other); | |||
| 42059 | SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; | |||
| 42060 | SDValue BcastLd = DAG.getMemIntrinsicNode( | |||
| 42061 | X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SrcVT.getScalarType(), | |||
| 42062 | LN->getPointerInfo(), LN->getOriginalAlign(), | |||
| 42063 | LN->getMemOperand()->getFlags()); | |||
| 42064 | DCI.CombineTo(N.getNode(), BcastLd); | |||
| 42065 | DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1)); | |||
| 42066 | DCI.recursivelyDeleteUnusedNodes(LN); | |||
| 42067 | return N; // Return N so it doesn't get rechecked! | |||
| 42068 | } | |||
| 42069 | } | |||
| 42070 | ||||
| 42071 | return SDValue(); | |||
| 42072 | } | |||
| 42073 | case X86ISD::VZEXT_MOVL: { | |||
| 42074 | SDValue N0 = N.getOperand(0); | |||
| 42075 | ||||
| 42076 | // If this is a vzmovl of a full vector load, replace it with a vzload, unless | |||
| 42077 | // the load is volatile. | |||
| 42078 | if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) { | |||
| 42079 | auto *LN = cast<LoadSDNode>(N0); | |||
| 42080 | if (SDValue VZLoad = | |||
| 42081 | narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) { | |||
| 42082 | DCI.CombineTo(N.getNode(), VZLoad); | |||
| 42083 | DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); | |||
| 42084 | DCI.recursivelyDeleteUnusedNodes(LN); | |||
| 42085 | return N; | |||
| 42086 | } | |||
| 42087 | } | |||
| 42088 | ||||
| 42089 | // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast | |||
| 42090 | // and can just use a VZEXT_LOAD. | |||
| 42091 | // FIXME: Is there some way to do this with SimplifyDemandedVectorElts? | |||
| 42092 | if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) { | |||
| 42093 | auto *LN = cast<MemSDNode>(N0); | |||
| 42094 | if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) { | |||
| 42095 | SDVTList Tys = DAG.getVTList(VT, MVT::Other); | |||
| 42096 | SDValue Ops[] = {LN->getChain(), LN->getBasePtr()}; | |||
| 42097 | SDValue VZLoad = | |||
| 42098 | DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, | |||
| 42099 | LN->getMemoryVT(), LN->getMemOperand()); | |||
| 42100 | DCI.CombineTo(N.getNode(), VZLoad); | |||
| 42101 | DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); | |||
| 42102 | DCI.recursivelyDeleteUnusedNodes(LN); | |||
| 42103 | return N; | |||
| 42104 | } | |||
| 42105 | } | |||
| 42106 | ||||
| 42107 | // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into | |||
| 42108 | // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X))))))) | |||
| 42109 | // if the upper bits of the i64 are zero. | |||
| 42110 | if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR && | |||
| 42111 | N0.getOperand(0).hasOneUse() && | |||
| 42112 | N0.getOperand(0).getValueType() == MVT::i64) { | |||
| 42113 | SDValue In = N0.getOperand(0); | |||
| 42114 | APInt Mask = APInt::getHighBitsSet(64, 32); | |||
| 42115 | if (DAG.MaskedValueIsZero(In, Mask)) { | |||
| 42116 | SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In); | |||
| 42117 | MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2); | |||
| 42118 | SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc); | |||
| 42119 | SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec); | |||
| 42120 | return DAG.getBitcast(VT, Movl); | |||
| 42121 | } | |||
| 42122 | } | |||
| 42123 | ||||
| 42124 | // Load a scalar integer constant directly to XMM instead of transferring an | |||
| 42125 | // immediate value from GPR. | |||
| 42126 | // vzext_movl (scalar_to_vector C) --> load [C,0...] | |||
| 42127 | if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) { | |||
| 42128 | if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) { | |||
| 42129 | // Create a vector constant - scalar constant followed by zeros. | |||
| 42130 | EVT ScalarVT = N0.getOperand(0).getValueType(); | |||
| 42131 | Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext()); | |||
| 42132 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 42133 | Constant *Zero = ConstantInt::getNullValue(ScalarTy); | |||
| 42134 | SmallVector<Constant *, 32> ConstantVec(NumElts, Zero); | |||
| 42135 | ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue()); | |||
| 42136 | ||||
| 42137 | // Load the vector constant from constant pool. | |||
| 42138 | MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()); | |||
| 42139 | SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT); | |||
| 42140 | MachinePointerInfo MPI = | |||
| 42141 | MachinePointerInfo::getConstantPool(DAG.getMachineFunction()); | |||
| 42142 | Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign(); | |||
| 42143 | return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment, | |||
| 42144 | MachineMemOperand::MOLoad); | |||
| 42145 | } | |||
| 42146 | } | |||
| 42147 | ||||
| 42148 | // Pull subvector inserts into undef through VZEXT_MOVL by making it an | |||
| 42149 | // insert into a zero vector. This helps get VZEXT_MOVL closer to | |||
| 42150 | // scalar_to_vectors where 256/512 are canonicalized to an insert and a | |||
| 42151 | // 128-bit scalar_to_vector. This reduces the number of isel patterns. | |||
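| | // e.g. (illustrative): | |||
| | // (vzext_movl (insert_subvector undef, X, 0)) | |||
| | // --> (insert_subvector (zero vector), (vzext_movl X), 0) | |||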
| 42152 | if (!DCI.isBeforeLegalizeOps() && N0.hasOneUse()) { | |||
| 42153 | SDValue V = peekThroughOneUseBitcasts(N0); | |||
| 42154 | ||||
| 42155 | if (V.getOpcode() == ISD::INSERT_SUBVECTOR && V.getOperand(0).isUndef() && | |||
| 42156 | isNullConstant(V.getOperand(2))) { | |||
| 42157 | SDValue In = V.getOperand(1); | |||
| 42158 | MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), | |||
| 42159 | In.getValueSizeInBits() / | |||
| 42160 | VT.getScalarSizeInBits()); | |||
| 42161 | In = DAG.getBitcast(SubVT, In); | |||
| 42162 | SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, SubVT, In); | |||
| 42163 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, | |||
| 42164 | getZeroVector(VT, Subtarget, DAG, DL), Movl, | |||
| 42165 | V.getOperand(2)); | |||
| 42166 | } | |||
| 42167 | } | |||
| 42168 | ||||
| 42169 | return SDValue(); | |||
| 42170 | } | |||
| 42171 | case X86ISD::BLENDI: { | |||
| 42172 | SDValue N0 = N.getOperand(0); | |||
| 42173 | SDValue N1 = N.getOperand(1); | |||
| 42174 | ||||
| 42175 | // blend(bitcast(x),bitcast(y)) -> bitcast(blend(x,y)) to narrower types. | |||
| 42176 | // TODO: Handle MVT::v16i16 repeated blend mask. | |||
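| | // e.g. a v4f64 blend with mask 0b0101 becomes a v8f32 blend with mask | |||
| | // 0b00110011, duplicating each mask bit per narrowed element. | |||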
| 42177 | if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST && | |||
| 42178 | N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()) { | |||
| 42179 | MVT SrcVT = N0.getOperand(0).getSimpleValueType(); | |||
| 42180 | if ((VT.getScalarSizeInBits() % SrcVT.getScalarSizeInBits()) == 0 && | |||
| 42181 | SrcVT.getScalarSizeInBits() >= 32) { | |||
| 42182 | unsigned BlendMask = N.getConstantOperandVal(2); | |||
| 42183 | unsigned Size = VT.getVectorNumElements(); | |||
| 42184 | unsigned Scale = VT.getScalarSizeInBits() / SrcVT.getScalarSizeInBits(); | |||
| 42185 | BlendMask = scaleVectorShuffleBlendMask(BlendMask, Size, Scale); | |||
| 42186 | return DAG.getBitcast( | |||
| 42187 | VT, DAG.getNode(X86ISD::BLENDI, DL, SrcVT, N0.getOperand(0), | |||
| 42188 | N1.getOperand(0), | |||
| 42189 | DAG.getTargetConstant(BlendMask, DL, MVT::i8))); | |||
| 42190 | } | |||
| 42191 | } | |||
| 42192 | return SDValue(); | |||
| 42193 | } | |||
| 42194 | case X86ISD::SHUFP: { | |||
| 42195 | // Fold shufps(shuffle(x),shuffle(y)) -> shufps(x,y). | |||
| 42196 | // This is a more relaxed shuffle combiner that can ignore oneuse limits. | |||
| 42197 | // TODO: Support types other than v4f32. | |||
| 42198 | if (VT == MVT::v4f32) { | |||
| 42199 | bool Updated = false; | |||
| 42200 | SmallVector<int> Mask; | |||
| 42201 | SmallVector<SDValue> Ops; | |||
| 42202 | if (getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask) && | |||
| 42203 | Ops.size() == 2) { | |||
| 42204 | for (int i = 0; i != 2; ++i) { | |||
| 42205 | SmallVector<SDValue> SubOps; | |||
| 42206 | SmallVector<int> SubMask, SubScaledMask; | |||
| 42207 | SDValue Sub = peekThroughBitcasts(Ops[i]); | |||
| 42208 | // TODO: Scaling might be easier if we specify the demanded elts. | |||
| 42209 | if (getTargetShuffleInputs(Sub, SubOps, SubMask, DAG, 0, false) && | |||
| 42210 | scaleShuffleElements(SubMask, 4, SubScaledMask) && | |||
| 42211 | SubOps.size() == 1 && isUndefOrInRange(SubScaledMask, 0, 4)) { | |||
| 42212 | int Ofs = i * 2; | |||
| 42213 | Mask[Ofs + 0] = SubScaledMask[Mask[Ofs + 0] % 4] + (i * 4); | |||
| 42214 | Mask[Ofs + 1] = SubScaledMask[Mask[Ofs + 1] % 4] + (i * 4); | |||
| 42215 | Ops[i] = DAG.getBitcast(VT, SubOps[0]); | |||
| 42216 | Updated = true; | |||
| 42217 | } | |||
| 42218 | } | |||
| 42219 | } | |||
| 42220 | if (Updated) { | |||
| 42221 | for (int &M : Mask) | |||
| 42222 | M %= 4; | |||
| 42223 | Ops.push_back(getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); | |||
| 42224 | return DAG.getNode(X86ISD::SHUFP, DL, VT, Ops); | |||
| 42225 | } | |||
| 42226 | } | |||
| 42227 | return SDValue(); | |||
| 42228 | } | |||
| 42229 | case X86ISD::VPERMI: { | |||
| 42230 | // vpermi(bitcast(x)) -> bitcast(vpermi(x)) for same number of elements. | |||
| 42231 | // TODO: Remove when we have preferred domains in combineX86ShuffleChain. | |||
| 42232 | SDValue N0 = N.getOperand(0); | |||
| 42233 | SDValue N1 = N.getOperand(1); | |||
| 42234 | unsigned EltSizeInBits = VT.getScalarSizeInBits(); | |||
| 42235 | if (N0.getOpcode() == ISD::BITCAST && | |||
| 42236 | N0.getOperand(0).getScalarValueSizeInBits() == EltSizeInBits) { | |||
| 42237 | SDValue Src = N0.getOperand(0); | |||
| 42238 | EVT SrcVT = Src.getValueType(); | |||
| 42239 | SDValue Res = DAG.getNode(X86ISD::VPERMI, DL, SrcVT, Src, N1); | |||
| 42240 | return DAG.getBitcast(VT, Res); | |||
| 42241 | } | |||
| 42242 | return SDValue(); | |||
| 42243 | } | |||
| 42244 | case X86ISD::VPERM2X128: { | |||
| 42245 | // Fold vperm2x128(bitcast(x),bitcast(y),c) -> bitcast(vperm2x128(x,y,c)). | |||
| 42246 | SDValue LHS = N->getOperand(0); | |||
| 42247 | SDValue RHS = N->getOperand(1); | |||
| 42248 | if (LHS.getOpcode() == ISD::BITCAST && | |||
| 42249 | (RHS.getOpcode() == ISD::BITCAST || RHS.isUndef())) { | |||
| 42250 | EVT SrcVT = LHS.getOperand(0).getValueType(); | |||
| 42251 | if (RHS.isUndef() || SrcVT == RHS.getOperand(0).getValueType()) { | |||
| 42252 | return DAG.getBitcast(VT, DAG.getNode(X86ISD::VPERM2X128, DL, SrcVT, | |||
| 42253 | DAG.getBitcast(SrcVT, LHS), | |||
| 42254 | DAG.getBitcast(SrcVT, RHS), | |||
| 42255 | N->getOperand(2))); | |||
| 42256 | } | |||
| 42257 | } | |||
| 42258 | ||||
| 42259 | // Fold vperm2x128(op(),op()) -> op(vperm2x128(),vperm2x128()). | |||
| 42260 | if (SDValue Res = canonicalizeLaneShuffleWithRepeatedOps(N, DAG, DL)) | |||
| 42261 | return Res; | |||
| 42262 | ||||
| 42263 | // Fold vperm2x128 subvector shuffle with an inner concat pattern. | |||
| 42264 | // vperm2x128(concat(X,Y),concat(Z,W)) --> concat X,Y etc. | |||
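| | // The immediate selects each 128-bit half from {LHS lo, LHS hi, RHS lo, | |||
| | // RHS hi} (indices 0-3), so e.g.: | |||
| | // vperm2x128(concat(X,Y), concat(Z,W), 0x31) --> concat(Y, W) | |||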
| 42265 | auto FindSubVector128 = [&](unsigned Idx) { | |||
| 42266 | if (Idx > 3) | |||
| 42267 | return SDValue(); | |||
| 42268 | SDValue Src = peekThroughBitcasts(N.getOperand(Idx < 2 ? 0 : 1)); | |||
| 42269 | SmallVector<SDValue> SubOps; | |||
| 42270 | if (collectConcatOps(Src.getNode(), SubOps, DAG) && SubOps.size() == 2) | |||
| 42271 | return SubOps[Idx & 1]; | |||
| 42272 | unsigned NumElts = Src.getValueType().getVectorNumElements(); | |||
| 42273 | if ((Idx & 1) == 1 && Src.getOpcode() == ISD::INSERT_SUBVECTOR && | |||
| 42274 | Src.getOperand(1).getValueSizeInBits() == 128 && | |||
| 42275 | Src.getConstantOperandAPInt(2) == (NumElts / 2)) { | |||
| 42276 | return Src.getOperand(1); | |||
| 42277 | } | |||
| 42278 | return SDValue(); | |||
| 42279 | }; | |||
| 42280 | unsigned Imm = N.getConstantOperandVal(2); | |||
| 42281 | if (SDValue SubLo = FindSubVector128(Imm & 0x0F)) { | |||
| 42282 | if (SDValue SubHi = FindSubVector128((Imm & 0xF0) >> 4)) { | |||
| 42283 | MVT SubVT = VT.getHalfNumVectorElementsVT(); | |||
| 42284 | SubLo = DAG.getBitcast(SubVT, SubLo); | |||
| 42285 | SubHi = DAG.getBitcast(SubVT, SubHi); | |||
| 42286 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, SubLo, SubHi); | |||
| 42287 | } | |||
| 42288 | } | |||
| 42289 | return SDValue(); | |||
| 42290 | } | |||
| 42291 | case X86ISD::PSHUFD: | |||
| 42292 | case X86ISD::PSHUFLW: | |||
| 42293 | case X86ISD::PSHUFHW: { | |||
| 42294 | SDValue N0 = N.getOperand(0); | |||
| 42295 | SDValue N1 = N.getOperand(1); | |||
| 42296 | if (N0->hasOneUse()) { | |||
| 42297 | SDValue V = peekThroughOneUseBitcasts(N0); | |||
| 42298 | switch (V.getOpcode()) { | |||
| 42299 | case X86ISD::VSHL: | |||
| 42300 | case X86ISD::VSRL: | |||
| 42301 | case X86ISD::VSRA: | |||
| 42302 | case X86ISD::VSHLI: | |||
| 42303 | case X86ISD::VSRLI: | |||
| 42304 | case X86ISD::VSRAI: | |||
| 42305 | case X86ISD::VROTLI: | |||
| 42306 | case X86ISD::VROTRI: { | |||
| 42307 | MVT InnerVT = V.getSimpleValueType(); | |||
| 42308 | if (InnerVT.getScalarSizeInBits() <= VT.getScalarSizeInBits()) { | |||
| 42309 | SDValue Res = DAG.getNode(Opcode, DL, VT, | |||
| 42310 | DAG.getBitcast(VT, V.getOperand(0)), N1); | |||
| 42311 | Res = DAG.getBitcast(InnerVT, Res); | |||
| 42312 | Res = DAG.getNode(V.getOpcode(), DL, InnerVT, Res, V.getOperand(1)); | |||
| 42313 | return DAG.getBitcast(VT, Res); | |||
| 42314 | } | |||
| 42315 | break; | |||
| 42316 | } | |||
| 42317 | } | |||
| 42318 | } | |||
| 42319 | ||||
| 42320 | Mask = getPSHUFShuffleMask(N); | |||
| 42321 | assert(Mask.size() == 4); | |||
| 42322 | break; | |||
| 42323 | } | |||
| 42324 | case X86ISD::MOVSD: | |||
| 42325 | case X86ISD::MOVSH: | |||
| 42326 | case X86ISD::MOVSS: { | |||
| 42327 | SDValue N0 = N.getOperand(0); | |||
| 42328 | SDValue N1 = N.getOperand(1); | |||
| 42329 | ||||
| 42330 | // Canonicalize scalar FPOps: | |||
| 42331 | // MOVS*(N0, OP(N0, N1)) --> MOVS*(N0, SCALAR_TO_VECTOR(OP(N0[0], N1[0]))) | |||
| 42332 | // If commutable, allow OP(N1[0], N0[0]). | |||
| 42333 | unsigned Opcode1 = N1.getOpcode(); | |||
| 42334 | if (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL || Opcode1 == ISD::FSUB || | |||
| 42335 | Opcode1 == ISD::FDIV) { | |||
| 42336 | SDValue N10 = N1.getOperand(0); | |||
| 42337 | SDValue N11 = N1.getOperand(1); | |||
| 42338 | if (N10 == N0 || | |||
| 42339 | (N11 == N0 && (Opcode1 == ISD::FADD || Opcode1 == ISD::FMUL))) { | |||
| 42340 | if (N10 != N0) | |||
| 42341 | std::swap(N10, N11); | |||
| 42342 | MVT SVT = VT.getVectorElementType(); | |||
| 42343 | SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL); | |||
| 42344 | N10 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N10, ZeroIdx); | |||
| 42345 | N11 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SVT, N11, ZeroIdx); | |||
| 42346 | SDValue Scl = DAG.getNode(Opcode1, DL, SVT, N10, N11); | |||
| 42347 | SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl); | |||
| 42348 | return DAG.getNode(Opcode, DL, VT, N0, SclVec); | |||
| 42349 | } | |||
| 42350 | } | |||
| 42351 | ||||
| 42352 | return SDValue(); | |||
| 42353 | } | |||
| 42354 | case X86ISD::INSERTPS: { | |||
| 42355 | assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32"); | |||
| 42356 | SDValue Op0 = N.getOperand(0); | |||
| 42357 | SDValue Op1 = N.getOperand(1); | |||
| 42358 | unsigned InsertPSMask = N.getConstantOperandVal(2); | |||
| 42359 | unsigned SrcIdx = (InsertPSMask >> 6) & 0x3; | |||
| 42360 | unsigned DstIdx = (InsertPSMask >> 4) & 0x3; | |||
| 42361 | unsigned ZeroMask = InsertPSMask & 0xF; | |||
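| | // INSERTPS immediate layout: bits [7:6] select the source element, bits | |||
| | // [5:4] select the destination element, and bits [3:0] zero out individual | |||
| | // destination elements. | |||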
| 42362 | ||||
| 42363 | // If we zero out all elements from Op0 then we don't need to reference it. | |||
| 42364 | if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef()) | |||
| 42365 | return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1, | |||
| 42366 | DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); | |||
| 42367 | ||||
| 42368 | // If we zero out the element from Op1 then we don't need to reference it. | |||
| 42369 | if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef()) | |||
| 42370 | return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), | |||
| 42371 | DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); | |||
| 42372 | ||||
| 42373 | // Attempt to merge insertps Op1 with an inner target shuffle node. | |||
| 42374 | SmallVector<int, 8> TargetMask1; | |||
| 42375 | SmallVector<SDValue, 2> Ops1; | |||
| 42376 | APInt KnownUndef1, KnownZero1; | |||
| 42377 | if (getTargetShuffleAndZeroables(Op1, TargetMask1, Ops1, KnownUndef1, | |||
| 42378 | KnownZero1)) { | |||
| 42379 | if (KnownUndef1[SrcIdx] || KnownZero1[SrcIdx]) { | |||
| 42380 | // Zero/UNDEF insertion - zero out element and remove dependency. | |||
| 42381 | InsertPSMask |= (1u << DstIdx); | |||
| 42382 | return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), | |||
| 42383 | DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); | |||
| 42384 | } | |||
| 42385 | // Update insertps mask srcidx and reference the source input directly. | |||
| 42386 | int M = TargetMask1[SrcIdx]; | |||
| 42387 | assert(0 <= M && M < 8 && "Shuffle index out of range"); | |||
| 42388 | InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6); | |||
| 42389 | Op1 = Ops1[M < 4 ? 0 : 1]; | |||
| 42390 | return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, | |||
| 42391 | DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); | |||
| 42392 | } | |||
| 42393 | ||||
| 42394 | // Attempt to merge insertps Op0 with an inner target shuffle node. | |||
| 42395 | SmallVector<int, 8> TargetMask0; | |||
| 42396 | SmallVector<SDValue, 2> Ops0; | |||
| 42397 | APInt KnownUndef0, KnownZero0; | |||
| 42398 | if (getTargetShuffleAndZeroables(Op0, TargetMask0, Ops0, KnownUndef0, | |||
| 42399 | KnownZero0)) { | |||
| 42400 | bool Updated = false; | |||
| 42401 | bool UseInput00 = false; | |||
| 42402 | bool UseInput01 = false; | |||
| 42403 | for (int i = 0; i != 4; ++i) { | |||
| 42404 | if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { | |||
| 42405 | // No change if element is already zero or the inserted element. | |||
| 42406 | continue; | |||
| 42407 | } | |||
| 42408 | ||||
| 42409 | if (KnownUndef0[i] || KnownZero0[i]) { | |||
| 42410 | // If the target mask is undef/zero then we must zero the element. | |||
| 42411 | InsertPSMask |= (1u << i); | |||
| 42412 | Updated = true; | |||
| 42413 | continue; | |||
| 42414 | } | |||
| 42415 | ||||
| 42416 | // The input vector element must be inline. | |||
| 42417 | int M = TargetMask0[i]; | |||
| 42418 | if (M != i && M != (i + 4)) | |||
| 42419 | return SDValue(); | |||
| 42420 | ||||
| 42421 | // Determine which inputs of the target shuffle we're using. | |||
| 42422 | UseInput00 |= (0 <= M && M < 4); | |||
| 42423 | UseInput01 |= (4 <= M); | |||
| 42424 | } | |||
| 42425 | ||||
| 42426 | // If we're not using both inputs of the target shuffle then use the | |||
| 42427 | // referenced input directly. | |||
| 42428 | if (UseInput00 && !UseInput01) { | |||
| 42429 | Updated = true; | |||
| 42430 | Op0 = Ops0[0]; | |||
| 42431 | } else if (!UseInput00 && UseInput01) { | |||
| 42432 | Updated = true; | |||
| 42433 | Op0 = Ops0[1]; | |||
| 42434 | } | |||
| 42435 | ||||
| 42436 | if (Updated) | |||
| 42437 | return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, | |||
| 42438 | DAG.getTargetConstant(InsertPSMask, DL, MVT::i8)); | |||
| 42439 | } | |||
| 42440 | ||||
| 42441 | // If we're inserting an element from a vbroadcast load, fold the | |||
| 42442 | // load into the X86insertps instruction. We need to convert the scalar | |||
| 42443 | // load to a vector and clear the source lane of the INSERTPS control. | |||
| 42444 | if (Op1.getOpcode() == X86ISD::VBROADCAST_LOAD && Op1.hasOneUse()) { | |||
| 42445 | auto *MemIntr = cast<MemIntrinsicSDNode>(Op1); | |||
| 42446 | if (MemIntr->getMemoryVT().getScalarSizeInBits() == 32) { | |||
| 42447 | SDValue Load = DAG.getLoad(MVT::f32, DL, MemIntr->getChain(), | |||
| 42448 | MemIntr->getBasePtr(), | |||
| 42449 | MemIntr->getMemOperand()); | |||
| 42450 | SDValue Insert = DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, | |||
| 42451 | DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, | |||
| 42452 | Load), | |||
| 42453 | DAG.getTargetConstant(InsertPSMask & 0x3f, DL, MVT::i8)); | |||
| 42454 | DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1)); | |||
| 42455 | return Insert; | |||
| 42456 | } | |||
| 42457 | } | |||
| 42458 | ||||
| 42459 | return SDValue(); | |||
| 42460 | } | |||
| 42461 | default: | |||
| 42462 | return SDValue(); | |||
| 42463 | } | |||
| 42464 | ||||
| 42465 | // Nuke no-op shuffles that show up after combining. | |||
| 42466 | if (isNoopShuffleMask(Mask)) | |||
| 42467 | return N.getOperand(0); | |||
| 42468 | ||||
| 42469 | // Look for simplifications involving one or two shuffle instructions. | |||
| 42470 | SDValue V = N.getOperand(0); | |||
| 42471 | switch (N.getOpcode()) { | |||
| 42472 | default: | |||
| 42473 | break; | |||
| 42474 | case X86ISD::PSHUFLW: | |||
| 42475 | case X86ISD::PSHUFHW: | |||
| 42476 | assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!"); | |||
| 42477 | ||||
| 42478 | // See if this reduces to a PSHUFD which is no more expensive and can | |||
| 42479 | // combine with more operations. Note that it has to at least flip the | |||
| 42480 | // dwords as otherwise it would have been removed as a no-op. | |||
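| | // e.g. PSHUFLW{2,3,0,1} swaps the two word pairs in the low half, which is | |||
| | // the same as PSHUFD with dword mask {1,0,2,3} after a bitcast. | |||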
| 42481 | if (ArrayRef(Mask).equals({2, 3, 0, 1})) { | |||
| 42482 | int DMask[] = {0, 1, 2, 3}; | |||
| 42483 | int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2; | |||
| 42484 | DMask[DOffset + 0] = DOffset + 1; | |||
| 42485 | DMask[DOffset + 1] = DOffset + 0; | |||
| 42486 | MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); | |||
| 42487 | V = DAG.getBitcast(DVT, V); | |||
| 42488 | V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V, | |||
| 42489 | getV4X86ShuffleImm8ForMask(DMask, DL, DAG)); | |||
| 42490 | return DAG.getBitcast(VT, V); | |||
| 42491 | } | |||
| 42492 | ||||
| 42493 | // Look for shuffle patterns which can be implemented as a single unpack. | |||
| 42494 | // FIXME: This doesn't handle the location of the PSHUFD generically, and | |||
| 42495 | // only works when we have a PSHUFD followed by two half-shuffles. | |||
| 42496 | if (Mask[0] == Mask[1] && Mask[2] == Mask[3] && | |||
| 42497 | (V.getOpcode() == X86ISD::PSHUFLW || | |||
| 42498 | V.getOpcode() == X86ISD::PSHUFHW) && | |||
| 42499 | V.getOpcode() != N.getOpcode() && | |||
| 42500 | V.hasOneUse() && V.getOperand(0).hasOneUse()) { | |||
| 42501 | SDValue D = peekThroughOneUseBitcasts(V.getOperand(0)); | |||
| 42502 | if (D.getOpcode() == X86ISD::PSHUFD) { | |||
| 42503 | SmallVector<int, 4> VMask = getPSHUFShuffleMask(V); | |||
| 42504 | SmallVector<int, 4> DMask = getPSHUFShuffleMask(D); | |||
| 42505 | int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; | |||
| 42506 | int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4; | |||
| 42507 | int WordMask[8]; | |||
| 42508 | for (int i = 0; i < 4; ++i) { | |||
| 42509 | WordMask[i + NOffset] = Mask[i] + NOffset; | |||
| 42510 | WordMask[i + VOffset] = VMask[i] + VOffset; | |||
| 42511 | } | |||
| 42512 | // Map the word mask through the DWord mask. | |||
| 42513 | int MappedMask[8]; | |||
| 42514 | for (int i = 0; i < 8; ++i) | |||
| 42515 | MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2; | |||
| 42516 | if (ArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) || | |||
| 42517 | ArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) { | |||
| 42518 | // We can replace all three shuffles with an unpack. | |||
| 42519 | V = DAG.getBitcast(VT, D.getOperand(0)); | |||
| 42520 | return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL | |||
| 42521 | : X86ISD::UNPCKH, | |||
| 42522 | DL, VT, V, V); | |||
| 42523 | } | |||
| 42524 | } | |||
| 42525 | } | |||
| 42526 | ||||
| 42527 | break; | |||
| 42528 | ||||
| 42529 | case X86ISD::PSHUFD: | |||
| 42530 | if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG)) | |||
| 42531 | return NewN; | |||
| 42532 | ||||
| 42533 | break; | |||
| 42534 | } | |||
| 42535 | ||||
| 42536 | return SDValue(); | |||
| 42537 | } | |||
| 42538 | ||||
| 42539 | /// Checks if the shuffle mask takes subsequent elements | |||
| 42540 | /// alternately from two vectors. | |||
| 42541 | /// For example <0, 5, 2, 7> or <8, 1, 10, 3, 12, 5, 14, 7> are both correct. | |||
| 42542 | static bool isAddSubOrSubAddMask(ArrayRef<int> Mask, bool &Op0Even) { | |||
| 42543 | ||||
| 42544 | int ParitySrc[2] = {-1, -1}; | |||
| 42545 | unsigned Size = Mask.size(); | |||
| 42546 | for (unsigned i = 0; i != Size; ++i) { | |||
| 42547 | int M = Mask[i]; | |||
| 42548 | if (M < 0) | |||
| 42549 | continue; | |||
| 42550 | ||||
| 42551 | // Make sure we are using the matching element from the input. | |||
| 42552 | if ((M % Size) != i) | |||
| 42553 | return false; | |||
| 42554 | ||||
| 42555 | // Make sure we use the same input for all elements of the same parity. | |||
| 42556 | int Src = M / Size; | |||
| 42557 | if (ParitySrc[i % 2] >= 0 && ParitySrc[i % 2] != Src) | |||
| 42558 | return false; | |||
| 42559 | ParitySrc[i % 2] = Src; | |||
| 42560 | } | |||
| 42561 | ||||
| 42562 | // Make sure each input is used. | |||
| 42563 | if (ParitySrc[0] < 0 || ParitySrc[1] < 0 || ParitySrc[0] == ParitySrc[1]) | |||
| 42564 | return false; | |||
| 42565 | ||||
| 42566 | Op0Even = ParitySrc[0] == 0; | |||
| 42567 | return true; | |||
| 42568 | } | |||
| 42569 | ||||
| 42570 | /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB(SUBADD) | |||
| 42571 | /// operation. If true is returned, the operands of the ADDSUB(SUBADD) operation | |||
| 42572 | /// are written to the parameters \p Opnd0 and \p Opnd1. | |||
| 42573 | /// | |||
| 42574 | /// We combine shuffles to ADDSUB(SUBADD) directly on the abstract vector shuffle nodes | |||
| 42575 | /// so it is easier to generically match. We also insert dummy vector shuffle | |||
| 42576 | /// nodes for the operands which explicitly discard the lanes which are unused | |||
| 42577 | /// by this operation to try to flow through the rest of the combiner | |||
| 42578 | /// the fact that they're unused. | |||
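| | /// For example (illustrative), for v4f32: | |||
| | /// shuffle<0,5,2,7>(fsub(a,b), fadd(a,b)) --> addsub(a,b) | |||
| | /// since even lanes take the subtraction and odd lanes take the addition. | |||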
| 42579 | static bool isAddSubOrSubAdd(SDNode *N, const X86Subtarget &Subtarget, | |||
| 42580 | SelectionDAG &DAG, SDValue &Opnd0, SDValue &Opnd1, | |||
| 42581 | bool &IsSubAdd) { | |||
| 42582 | ||||
| 42583 | EVT VT = N->getValueType(0); | |||
| 42584 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 42585 | if (!Subtarget.hasSSE3() || !TLI.isTypeLegal(VT) || | |||
| 42586 | !VT.getSimpleVT().isFloatingPoint()) | |||
| 42587 | return false; | |||
| 42588 | ||||
| 42589 | // We only handle target-independent shuffles. | |||
| 42590 | // FIXME: It would be easy and harmless to use the target shuffle mask | |||
| 42591 | // extraction tool to support more. | |||
| 42592 | if (N->getOpcode() != ISD::VECTOR_SHUFFLE) | |||
| 42593 | return false; | |||
| 42594 | ||||
| 42595 | SDValue V1 = N->getOperand(0); | |||
| 42596 | SDValue V2 = N->getOperand(1); | |||
| 42597 | ||||
| 42598 | // Make sure we have an FADD and an FSUB. | |||
| 42599 | if ((V1.getOpcode() != ISD::FADD && V1.getOpcode() != ISD::FSUB) || | |||
| 42600 | (V2.getOpcode() != ISD::FADD && V2.getOpcode() != ISD::FSUB) || | |||
| 42601 | V1.getOpcode() == V2.getOpcode()) | |||
| 42602 | return false; | |||
| 42603 | ||||
| 42604 | // If there are other uses of these operations we can't fold them. | |||
| 42605 | if (!V1->hasOneUse() || !V2->hasOneUse()) | |||
| 42606 | return false; | |||
| 42607 | ||||
| 42608 | // Ensure that both operations have the same operands. Note that we can | |||
| 42609 | // commute the FADD operands. | |||
| 42610 | SDValue LHS, RHS; | |||
| 42611 | if (V1.getOpcode() == ISD::FSUB) { | |||
| 42612 | LHS = V1->getOperand(0); RHS = V1->getOperand(1); | |||
| 42613 | if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) && | |||
| 42614 | (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS)) | |||
| 42615 | return false; | |||
| 42616 | } else { | |||
| 42617 | assert(V2.getOpcode() == ISD::FSUB && "Unexpected opcode"); | |||
| 42618 | LHS = V2->getOperand(0); RHS = V2->getOperand(1); | |||
| 42619 | if ((V1->getOperand(0) != LHS || V1->getOperand(1) != RHS) && | |||
| 42620 | (V1->getOperand(0) != RHS || V1->getOperand(1) != LHS)) | |||
| 42621 | return false; | |||
| 42622 | } | |||
| 42623 | ||||
| 42624 | ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask(); | |||
| 42625 | bool Op0Even; | |||
| 42626 | if (!isAddSubOrSubAddMask(Mask, Op0Even)) | |||
| 42627 | return false; | |||
| 42628 | ||||
| 42629 | // It's a subadd if the vector in the even parity is an FADD. | |||
| 42630 | IsSubAdd = Op0Even ? V1->getOpcode() == ISD::FADD | |||
| 42631 | : V2->getOpcode() == ISD::FADD; | |||
| 42632 | ||||
| 42633 | Opnd0 = LHS; | |||
| 42634 | Opnd1 = RHS; | |||
| 42635 | return true; | |||
| 42636 | } | |||
| 42637 | ||||
| 42638 | /// Combine shuffle of two fma nodes into FMAddSub or FMSubAdd. | |||
| 42639 | static SDValue combineShuffleToFMAddSub(SDNode *N, | |||
| 42640 | const X86Subtarget &Subtarget, | |||
| 42641 | SelectionDAG &DAG) { | |||
| 42642 | // We only handle target-independent shuffles. | |||
| 42643 | // FIXME: It would be easy and harmless to use the target shuffle mask | |||
| 42644 | // extraction tool to support more. | |||
| 42645 | if (N->getOpcode() != ISD::VECTOR_SHUFFLE) | |||
| 42646 | return SDValue(); | |||
| 42647 | ||||
| 42648 | MVT VT = N->getSimpleValueType(0); | |||
| 42649 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 42650 | if (!Subtarget.hasAnyFMA() || !TLI.isTypeLegal(VT)) | |||
| 42651 | return SDValue(); | |||
| 42652 | ||||
| 42653 | // We're trying to match (shuffle fma(a, b, c), X86Fmsub(a, b, c)). | |||
| 42654 | SDValue Op0 = N->getOperand(0); | |||
| 42655 | SDValue Op1 = N->getOperand(1); | |||
| 42656 | SDValue FMAdd = Op0, FMSub = Op1; | |||
| 42657 | if (FMSub.getOpcode() != X86ISD::FMSUB) | |||
| 42658 | std::swap(FMAdd, FMSub); | |||
| 42659 | ||||
| 42660 | if (FMAdd.getOpcode() != ISD::FMA || FMSub.getOpcode() != X86ISD::FMSUB || | |||
| 42661 | FMAdd.getOperand(0) != FMSub.getOperand(0) || !FMAdd.hasOneUse() || | |||
| 42662 | FMAdd.getOperand(1) != FMSub.getOperand(1) || !FMSub.hasOneUse() || | |||
| 42663 | FMAdd.getOperand(2) != FMSub.getOperand(2)) | |||
| 42664 | return SDValue(); | |||
| 42665 | ||||
| 42666 | // Check for correct shuffle mask. | |||
| 42667 | ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask(); | |||
| 42668 | bool Op0Even; | |||
| 42669 | if (!isAddSubOrSubAddMask(Mask, Op0Even)) | |||
| 42670 | return SDValue(); | |||
| 42671 | ||||
| 42672 | // FMAddSub takes zeroth operand from FMSub node. | |||
| 42673 | SDLoc DL(N); | |||
| 42674 | bool IsSubAdd = Op0Even ? Op0 == FMAdd : Op1 == FMAdd; | |||
| 42675 | unsigned Opcode = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB; | |||
| 42676 | return DAG.getNode(Opcode, DL, VT, FMAdd.getOperand(0), FMAdd.getOperand(1), | |||
| 42677 | FMAdd.getOperand(2)); | |||
| 42678 | } | |||
| 42679 | ||||
| 42680 | /// Try to combine a shuffle into a target-specific add-sub or | |||
| 42681 | /// mul-add-sub node. | |||
| 42682 | static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, | |||
| 42683 | const X86Subtarget &Subtarget, | |||
| 42684 | SelectionDAG &DAG) { | |||
| 42685 | if (SDValue V = combineShuffleToFMAddSub(N, Subtarget, DAG)) | |||
| 42686 | return V; | |||
| 42687 | ||||
| 42688 | SDValue Opnd0, Opnd1; | |||
| 42689 | bool IsSubAdd; | |||
| 42690 | if (!isAddSubOrSubAdd(N, Subtarget, DAG, Opnd0, Opnd1, IsSubAdd)) | |||
| 42691 | return SDValue(); | |||
| 42692 | ||||
| 42693 | MVT VT = N->getSimpleValueType(0); | |||
| 42694 | SDLoc DL(N); | |||
| 42695 | ||||
| 42696 | // Try to generate X86ISD::FMADDSUB node here. | |||
| 42697 | SDValue Opnd2; | |||
| 42698 | if (isFMAddSubOrFMSubAdd(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) { | |||
| 42699 | unsigned Opc = IsSubAdd ? X86ISD::FMSUBADD : X86ISD::FMADDSUB; | |||
| 42700 | return DAG.getNode(Opc, DL, VT, Opnd0, Opnd1, Opnd2); | |||
| 42701 | } | |||
| 42702 | ||||
| 42703 | if (IsSubAdd) | |||
| 42704 | return SDValue(); | |||
| 42705 | ||||
| 42706 | // Do not generate X86ISD::ADDSUB node for 512-bit types even though | |||
| 42707 | // the ADDSUB idiom has been successfully recognized. There are no known | |||
| 42708 | // X86 targets with 512-bit ADDSUB instructions! | |||
| 42709 | if (VT.is512BitVector()) | |||
| 42710 | return SDValue(); | |||
| 42711 | ||||
| 42712 | // Do not generate X86ISD::ADDSUB node for FP16's vector types even though | |||
| 42713 | // the ADDSUB idiom has been successfully recognized. There are no known | |||
| 42714 | // X86 targets with FP16 ADDSUB instructions! | |||
| 42715 | if (VT.getVectorElementType() == MVT::f16) | |||
| 42716 | return SDValue(); | |||
| 42717 | ||||
| 42718 | return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1); | |||
| 42719 | } | |||
| 42720 | ||||
| 42721 | // We are looking for a shuffle where both sources are concatenated with undef | |||
| 42722 | // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so | |||
| 42723 | // if we can express this as a single-source shuffle, that's preferable. | |||
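| | // e.g. (illustrative), with v4i32 inputs t1 and t2: | |||
| | // shuffle<0,8,1,9,2,10,3,11>(concat(t1,undef), concat(t2,undef)) | |||
| | // --> shuffle<0,4,1,5,2,6,3,7>(concat(t1,t2), undef) | |||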
| 42724 | static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG, | |||
| 42725 | const X86Subtarget &Subtarget) { | |||
| 42726 | if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N)) | |||
| 42727 | return SDValue(); | |||
| 42728 | ||||
| 42729 | EVT VT = N->getValueType(0); | |||
| 42730 | ||||
| 42731 | // We only care about shuffles of 128/256-bit vectors of 32/64-bit values. | |||
| 42732 | if (!VT.is128BitVector() && !VT.is256BitVector()) | |||
| 42733 | return SDValue(); | |||
| 42734 | ||||
| 42735 | if (VT.getVectorElementType() != MVT::i32 && | |||
| 42736 | VT.getVectorElementType() != MVT::i64 && | |||
| 42737 | VT.getVectorElementType() != MVT::f32 && | |||
| 42738 | VT.getVectorElementType() != MVT::f64) | |||
| 42739 | return SDValue(); | |||
| 42740 | ||||
| 42741 | SDValue N0 = N->getOperand(0); | |||
| 42742 | SDValue N1 = N->getOperand(1); | |||
| 42743 | ||||
| 42744 | // Check that both sources are concats with undef. | |||
| 42745 | if (N0.getOpcode() != ISD::CONCAT_VECTORS || | |||
| 42746 | N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 || | |||
| 42747 | N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() || | |||
| 42748 | !N1.getOperand(1).isUndef()) | |||
| 42749 | return SDValue(); | |||
| 42750 | ||||
| 42751 | // Construct the new shuffle mask. Elements from the first source retain their | |||
| 42752 | // index, but elements from the second source no longer need to skip an undef. | |||
| 42753 | SmallVector<int, 8> Mask; | |||
| 42754 | int NumElts = VT.getVectorNumElements(); | |||
| 42755 | ||||
| 42756 | ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N); | |||
| 42757 | for (int Elt : SVOp->getMask()) | |||
| 42758 | Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2)); | |||
| 42759 | ||||
| 42760 | SDLoc DL(N); | |||
| 42761 | SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0), | |||
| 42762 | N1.getOperand(0)); | |||
| 42763 | return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask); | |||
| 42764 | } | |||
| 42765 | ||||
| 42766 | /// If we have a shuffle of AVX/AVX512 (256/512 bit) vectors that only uses the | |||
| 42767 | /// low half of each source vector and does not set any high half elements in | |||
| 42768 | /// the destination vector, narrow the shuffle to half its original size. | |||
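| | /// For example (illustrative), for v8f32: | |||
| | /// shuffle<0,8,1,9,u,u,u,u>(a, b) | |||
| | /// --> concat(shuffle<0,4,1,5>(a_lo, b_lo), undef) | |||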
| 42769 | static SDValue narrowShuffle(ShuffleVectorSDNode *Shuf, SelectionDAG &DAG) { | |||
| 42770 | if (!Shuf->getValueType(0).isSimple()) | |||
| 42771 | return SDValue(); | |||
| 42772 | MVT VT = Shuf->getSimpleValueType(0); | |||
| 42773 | if (!VT.is256BitVector() && !VT.is512BitVector()) | |||
| 42774 | return SDValue(); | |||
| 42775 | ||||
| 42776 | // See if we can ignore all of the high elements of the shuffle. | |||
| 42777 | ArrayRef<int> Mask = Shuf->getMask(); | |||
| 42778 | if (!isUndefUpperHalf(Mask)) | |||
| 42779 | return SDValue(); | |||
| 42780 | ||||
| 42781 | // Check if the shuffle mask accesses only the low half of each input vector | |||
| 42782 | // (half-index output is 0 or 2). | |||
| 42783 | int HalfIdx1, HalfIdx2; | |||
| 42784 | SmallVector<int, 8> HalfMask(Mask.size() / 2); | |||
| 42785 | if (!getHalfShuffleMask(Mask, HalfMask, HalfIdx1, HalfIdx2) || | |||
| 42786 | (HalfIdx1 % 2 == 1) || (HalfIdx2 % 2 == 1)) | |||
| 42787 | return SDValue(); | |||
| 42788 | ||||
| 42789 | // Create a half-width shuffle to replace the unnecessarily wide shuffle. | |||
| 42790 | // The trick is knowing that all of the insert/extract are actually free | |||
| 42791 | // subregister (zmm<->ymm or ymm<->xmm) ops. That leaves us with a shuffle | |||
| 42792 | // of narrow inputs into a narrow output, and that is always cheaper than | |||
| 42793 | // the wide shuffle that we started with. | |||
| 42794 | return getShuffleHalfVectors(SDLoc(Shuf), Shuf->getOperand(0), | |||
| 42795 | Shuf->getOperand(1), HalfMask, HalfIdx1, | |||
| 42796 | HalfIdx2, false, DAG, /*UseConcat*/true); | |||
| 42797 | } | |||
| 42798 | ||||
| 42799 | static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, | |||
| 42800 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 42801 | const X86Subtarget &Subtarget) { | |||
| 42802 | if (auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N)) | |||
| 42803 | if (SDValue V = narrowShuffle(Shuf, DAG)) | |||
| 42804 | return V; | |||
| 42805 | ||||
| 42806 | // If we have legalized the vector types, look for blends of FADD and FSUB | |||
| 42807 | // nodes that we can fuse into an ADDSUB, FMADDSUB, or FMSUBADD node. | |||
| 42808 | SDLoc dl(N); | |||
| 42809 | EVT VT = N->getValueType(0); | |||
| 42810 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 42811 | if (TLI.isTypeLegal(VT)) | |||
| 42812 | if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG)) | |||
| 42813 | return AddSub; | |||
| 42814 | ||||
| 42815 | // Attempt to combine into a vector load/broadcast. | |||
| 42816 | if (SDValue LD = combineToConsecutiveLoads( | |||
| 42817 | VT, SDValue(N, 0), dl, DAG, Subtarget, /*IsAfterLegalize*/ true)) | |||
| 42818 | return LD; | |||
| 42819 | ||||
| 42820 | // For AVX2, we sometimes want to combine | |||
| 42821 | // (vector_shuffle <mask> (concat_vectors t1, undef) | |||
| 42822 | // (concat_vectors t2, undef)) | |||
| 42823 | // Into: | |||
| 42824 | // (vector_shuffle <mask> (concat_vectors t1, t2), undef) | |||
| 42825 | // Since the latter can be efficiently lowered with VPERMD/VPERMQ | |||
| 42826 | if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget)) | |||
| 42827 | return ShufConcat; | |||
| 42828 | ||||
| 42829 | if (isTargetShuffle(N->getOpcode())) { | |||
| 42830 | SDValue Op(N, 0); | |||
| 42831 | if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget)) | |||
| 42832 | return Shuffle; | |||
| 42833 | ||||
| 42834 | // Try recursively combining arbitrary sequences of x86 shuffle | |||
| 42835 | // instructions into higher-order shuffles. We do this after combining | |||
| 42836 | // specific PSHUF instruction sequences into their minimal form so that we | |||
| 42837 | // can evaluate how many specialized shuffle instructions are involved in | |||
| 42838 | // a particular chain. | |||
| 42839 | if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) | |||
| 42840 | return Res; | |||
| 42841 | ||||
| 42842 | // Simplify source operands based on shuffle mask. | |||
| 42843 | // TODO - merge this into combineX86ShufflesRecursively. | |||
| 42844 | APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); | |||
| 42845 | if (TLI.SimplifyDemandedVectorElts(Op, DemandedElts, DCI)) | |||
| 42846 | return SDValue(N, 0); | |||
| 42847 | ||||
| 42848 | // Canonicalize SHUFFLE(BINOP(X,Y)) -> BINOP(SHUFFLE(X),SHUFFLE(Y)). | |||
| 42849 | // Perform this after other shuffle combines to allow inner shuffles to be | |||
| 42850 | // combined away first. | |||
| 42851 | if (SDValue BinOp = canonicalizeShuffleWithBinOps(Op, DAG, dl)) | |||
| 42852 | return BinOp; | |||
| 42853 | } | |||
| 42854 | ||||
| 42855 | return SDValue(); | |||
| 42856 | } | |||
| 42857 | ||||
| 42858 | // Simplify variable target shuffle masks based on the demanded elements. | |||
| 42859 | // TODO: Handle DemandedBits in mask indices as well? | |||
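| | // e.g. if only the low half of a PSHUFB result is demanded and its mask | |||
| | // comes from a constant pool, the undemanded mask elements can be | |||
| | // rewritten to undef, allowing the constant to be simplified or shared. | |||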
| 42860 | bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle( | |||
| 42861 | SDValue Op, const APInt &DemandedElts, unsigned MaskIndex, | |||
| 42862 | TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const { | |||
| 42863 | // If we're demanding all elements don't bother trying to simplify the mask. | |||
| 42864 | unsigned NumElts = DemandedElts.getBitWidth(); | |||
| 42865 | if (DemandedElts.isAllOnes()) | |||
| 42866 | return false; | |||
| 42867 | ||||
| 42868 | SDValue Mask = Op.getOperand(MaskIndex); | |||
| 42869 | if (!Mask.hasOneUse()) | |||
| 42870 | return false; | |||
| 42871 | ||||
| 42872 | // Attempt to generically simplify the variable shuffle mask. | |||
| 42873 | APInt MaskUndef, MaskZero; | |||
| 42874 | if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO, | |||
| 42875 | Depth + 1)) | |||
| 42876 | return true; | |||
| 42877 | ||||
| 42878 | // Attempt to extract+simplify a (constant pool load) shuffle mask. | |||
| 42879 | // TODO: Support other types from getTargetShuffleMaskIndices? | |||
| 42880 | SDValue BC = peekThroughOneUseBitcasts(Mask); | |||
| 42881 | EVT BCVT = BC.getValueType(); | |||
| 42882 | auto *Load = dyn_cast<LoadSDNode>(BC); | |||
| 42883 | if (!Load) | |||
| 42884 | return false; | |||
| 42885 | ||||
| 42886 | const Constant *C = getTargetConstantFromNode(Load); | |||
| 42887 | if (!C) | |||
| 42888 | return false; | |||
| 42889 | ||||
| 42890 | Type *CTy = C->getType(); | |||
| 42891 | if (!CTy->isVectorTy() || | |||
| 42892 | CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits()) | |||
| 42893 | return false; | |||
| 42894 | ||||
| 42895 | // Handle scaling for i64 elements on 32-bit targets. | |||
| 42896 | unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements(); | |||
| 42897 | if (NumCstElts != NumElts && NumCstElts != (NumElts * 2)) | |||
| 42898 | return false; | |||
| 42899 | unsigned Scale = NumCstElts / NumElts; | |||
| 42900 | ||||
| 42901 | // Simplify mask if we have an undemanded element that is not undef. | |||
| 42902 | bool Simplified = false; | |||
| 42903 | SmallVector<Constant *, 32> ConstVecOps; | |||
| 42904 | for (unsigned i = 0; i != NumCstElts; ++i) { | |||
| 42905 | Constant *Elt = C->getAggregateElement(i); | |||
| 42906 | if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) { | |||
| 42907 | ConstVecOps.push_back(UndefValue::get(Elt->getType())); | |||
| 42908 | Simplified = true; | |||
| 42909 | continue; | |||
| 42910 | } | |||
| 42911 | ConstVecOps.push_back(Elt); | |||
| 42912 | } | |||
| 42913 | if (!Simplified) | |||
| 42914 | return false; | |||
| 42915 | ||||
| 42916 | // Generate new constant pool entry + legalize immediately for the load. | |||
| 42917 | SDLoc DL(Op); | |||
| 42918 | SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT); | |||
| 42919 | SDValue LegalCV = LowerConstantPool(CV, TLO.DAG); | |||
| 42920 | SDValue NewMask = TLO.DAG.getLoad( | |||
| 42921 | BCVT, DL, TLO.DAG.getEntryNode(), LegalCV, | |||
| 42922 | MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()), | |||
| 42923 | Load->getAlign()); | |||
| 42924 | return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask)); | |||
| 42925 | } | |||
| 42926 | ||||
| 42927 | bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( | |||
| 42928 | SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero, | |||
| 42929 | TargetLoweringOpt &TLO, unsigned Depth) const { | |||
| 42930 | int NumElts = DemandedElts.getBitWidth(); | |||
| 42931 | unsigned Opc = Op.getOpcode(); | |||
| 42932 | EVT VT = Op.getValueType(); | |||
| 42933 | ||||
| 42934 | // Handle special case opcodes. | |||
| 42935 | switch (Opc) { | |||
| 42936 | case X86ISD::PMULDQ: | |||
| 42937 | case X86ISD::PMULUDQ: { | |||
| 42938 | APInt LHSUndef, LHSZero; | |||
| 42939 | APInt RHSUndef, RHSZero; | |||
| 42940 | SDValue LHS = Op.getOperand(0); | |||
| 42941 | SDValue RHS = Op.getOperand(1); | |||
| 42942 | if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO, | |||
| 42943 | Depth + 1)) | |||
| 42944 | return true; | |||
| 42945 | if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO, | |||
| 42946 | Depth + 1)) | |||
| 42947 | return true; | |||
| 42948 | // Multiply by zero. | |||
| 42949 | KnownZero = LHSZero | RHSZero; | |||
| 42950 | break; | |||
| 42951 | } | |||
| 42952 | case X86ISD::VPMADDWD: { | |||
| 42953 | APInt LHSUndef, LHSZero; | |||
| 42954 | APInt RHSUndef, RHSZero; | |||
| 42955 | SDValue LHS = Op.getOperand(0); | |||
| 42956 | SDValue RHS = Op.getOperand(1); | |||
| 42957 | APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, 2 * NumElts); | |||
| 42958 | ||||
| 42959 | if (SimplifyDemandedVectorElts(LHS, DemandedSrcElts, LHSUndef, LHSZero, TLO, | |||
| 42960 | Depth + 1)) | |||
| 42961 | return true; | |||
| 42962 | if (SimplifyDemandedVectorElts(RHS, DemandedSrcElts, RHSUndef, RHSZero, TLO, | |||
| 42963 | Depth + 1)) | |||
| 42964 | return true; | |||
| 42965 | ||||
| 42966 | // TODO: Multiply by zero. | |||
| 42967 | ||||
| 42968 | // If RHS/LHS elements are known zero then the corresponding LHS/RHS elements need not be demanded. | |||
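| | // (Each VPMADDWD result element is LHS[2i]*RHS[2i] + LHS[2i+1]*RHS[2i+1], | |||
| | // so a known-zero RHS element kills its product term and the matching | |||
| | // LHS element becomes dead, and vice versa.) | |||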
| 42969 | APInt DemandedLHSElts = DemandedSrcElts & ~RHSZero; | |||
| 42970 | if (SimplifyDemandedVectorElts(LHS, DemandedLHSElts, LHSUndef, LHSZero, TLO, | |||
| 42971 | Depth + 1)) | |||
| 42972 | return true; | |||
| 42973 | APInt DemandedRHSElts = DemandedSrcElts & ~LHSZero; | |||
| 42974 | if (SimplifyDemandedVectorElts(RHS, DemandedRHSElts, RHSUndef, RHSZero, TLO, | |||
| 42975 | Depth + 1)) | |||
| 42976 | return true; | |||
| 42977 | break; | |||
| 42978 | } | |||
| 42979 | case X86ISD::PSADBW: { | |||
| 42980 | SDValue LHS = Op.getOperand(0); | |||
| 42981 | SDValue RHS = Op.getOperand(1); | |||
| 42982 | assert(VT.getScalarType() == MVT::i64 && | |||
| 42983 | LHS.getValueType() == RHS.getValueType() && | |||
| 42984 | LHS.getValueType().getScalarType() == MVT::i8 && | |||
| 42985 | "Unexpected PSADBW types"); | |||
| 42986 | ||||
| 42987 | // Aggressively peek through ops to get at the demanded elts. | |||
| 42988 | if (!DemandedElts.isAllOnes()) { | |||
| 42989 | unsigned NumSrcElts = LHS.getValueType().getVectorNumElements(); | |||
| 42990 | APInt DemandedSrcElts = APIntOps::ScaleBitMask(DemandedElts, NumSrcElts); | |||
| 42991 | SDValue NewLHS = SimplifyMultipleUseDemandedVectorElts( | |||
| 42992 | LHS, DemandedSrcElts, TLO.DAG, Depth + 1); | |||
| 42993 | SDValue NewRHS = SimplifyMultipleUseDemandedVectorElts( | |||
| 42994 | RHS, DemandedSrcElts, TLO.DAG, Depth + 1); | |||
| 42995 | if (NewLHS || NewRHS) { | |||
| 42996 | NewLHS = NewLHS ? NewLHS : LHS; | |||
| 42997 | NewRHS = NewRHS ? NewRHS : RHS; | |||
| 42998 | return TLO.CombineTo( | |||
| 42999 | Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS)); | |||
| 43000 | } | |||
| 43001 | } | |||
| 43002 | break; | |||
| 43003 | } | |||
| 43004 | case X86ISD::VSHL: | |||
| 43005 | case X86ISD::VSRL: | |||
| 43006 | case X86ISD::VSRA: { | |||
| 43007 | // We only need the bottom 64-bits of the (128-bit) shift amount. | |||
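| | // (The hardware shifts, e.g. PSLLQ with an XMM count operand, read only | |||
| | // bits [63:0] of that operand.) | |||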
| 43008 | SDValue Amt = Op.getOperand(1); | |||
| 43009 | MVT AmtVT = Amt.getSimpleValueType(); | |||
| 43010 | assert(AmtVT.is128BitVector() && "Unexpected value type"); | |||
| 43011 | ||||
| 43012 | // If the shift amount is only ever used as an SSE shift amount then we | |||
| 43013 | // know that only its bottom 64-bits are ever used. | |||
| 43014 | bool AssumeSingleUse = llvm::all_of(Amt->uses(), [&Amt](SDNode *Use) { | |||
| 43015 | unsigned UseOpc = Use->getOpcode(); | |||
| 43016 | return (UseOpc == X86ISD::VSHL || UseOpc == X86ISD::VSRL || | |||
| 43017 | UseOpc == X86ISD::VSRA) && | |||
| 43018 | Use->getOperand(0) != Amt; | |||
| 43019 | }); | |||
| 43020 | ||||
| 43021 | APInt AmtUndef, AmtZero; | |||
| 43022 | unsigned NumAmtElts = AmtVT.getVectorNumElements(); | |||
| 43023 | APInt AmtElts = APInt::getLowBitsSet(NumAmtElts, NumAmtElts / 2); | |||
| 43024 | if (SimplifyDemandedVectorElts(Amt, AmtElts, AmtUndef, AmtZero, TLO, | |||
| 43025 | Depth + 1, AssumeSingleUse)) | |||
| 43026 | return true; | |||
| 43027 | [[fallthrough]]; | |||
| 43028 | } | |||
| 43029 | case X86ISD::VSHLI: | |||
| 43030 | case X86ISD::VSRLI: | |||
| 43031 | case X86ISD::VSRAI: { | |||
| 43032 | SDValue Src = Op.getOperand(0); | |||
| 43033 | APInt SrcUndef; | |||
| 43034 | if (SimplifyDemandedVectorElts(Src, DemandedElts, SrcUndef, KnownZero, TLO, | |||
| 43035 | Depth + 1)) | |||
| 43036 | return true; | |||
| 43037 | ||||
| 43038 | // Fold shift(0,x) -> 0 | |||
| 43039 | if (DemandedElts.isSubsetOf(KnownZero)) | |||
| 43040 | return TLO.CombineTo( | |||
| 43041 | Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op))); | |||
| 43042 | ||||
| 43043 | // Aggressively peek through ops to get at the demanded elts. | |||
| 43044 | if (!DemandedElts.isAllOnes()) | |||
| 43045 | if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts( | |||
| 43046 | Src, DemandedElts, TLO.DAG, Depth + 1)) | |||
| 43047 | return TLO.CombineTo( | |||
| 43048 | Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc, Op.getOperand(1))); | |||
| 43049 | break; | |||
| 43050 | } | |||
| 43051 | case X86ISD::VPSHA: | |||
| 43052 | case X86ISD::VPSHL: | |||
| 43053 | case X86ISD::VSHLV: | |||
| 43054 | case X86ISD::VSRLV: | |||
| 43055 | case X86ISD::VSRAV: { | |||
| 43056 | APInt LHSUndef, LHSZero; | |||
| 43057 | APInt RHSUndef, RHSZero; | |||
| 43058 | SDValue LHS = Op.getOperand(0); | |||
| 43059 | SDValue RHS = Op.getOperand(1); | |||
| 43060 | if (SimplifyDemandedVectorElts(LHS, DemandedElts, LHSUndef, LHSZero, TLO, | |||
| 43061 | Depth + 1)) | |||
| 43062 | return true; | |||
| 43063 | ||||
| 43064 | // Fold shift(0,x) -> 0 | |||
| 43065 | if (DemandedElts.isSubsetOf(LHSZero)) | |||
| 43066 | return TLO.CombineTo( | |||
| 43067 | Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op))); | |||
| 43068 | ||||
| 43069 | if (SimplifyDemandedVectorElts(RHS, DemandedElts, RHSUndef, RHSZero, TLO, | |||
| 43070 | Depth + 1)) | |||
| 43071 | return true; | |||
| 43072 | ||||
| 43073 | KnownZero = LHSZero; | |||
| 43074 | break; | |||
| 43075 | } | |||
| 43076 | case X86ISD::KSHIFTL: { | |||
| 43077 | SDValue Src = Op.getOperand(0); | |||
| 43078 | auto *Amt = cast<ConstantSDNode>(Op.getOperand(1)); | |||
| 43079 | assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount"); | |||
| 43080 | unsigned ShiftAmt = Amt->getZExtValue(); | |||
| 43081 | ||||
| 43082 | if (ShiftAmt == 0) | |||
| 43083 | return TLO.CombineTo(Op, Src); | |||
| 43084 | ||||
| 43085 | // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a | |||
| 43086 | // single shift. We can do this if the bottom bits (which are shifted | |||
| 43087 | // out) are never demanded. | |||
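| | // e.g. (illustrative, v8i1): kshiftl(kshiftr(X, 2), 3) --> kshiftl(X, 1) | |||
| | // when the low 3 result elements are not demanded. | |||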
| 43088 | if (Src.getOpcode() == X86ISD::KSHIFTR) { | |||
| 43089 | if (!DemandedElts.intersects(APInt::getLowBitsSet(NumElts, ShiftAmt))) { | |||
| 43090 | unsigned C1 = Src.getConstantOperandVal(1); | |||
| 43091 | unsigned NewOpc = X86ISD::KSHIFTL; | |||
| 43092 | int Diff = ShiftAmt - C1; | |||
| 43093 | if (Diff < 0) { | |||
| 43094 | Diff = -Diff; | |||
| 43095 | NewOpc = X86ISD::KSHIFTR; | |||
| 43096 | } | |||
| 43097 | ||||
| 43098 | SDLoc dl(Op); | |||
| 43099 | SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8); | |||
| 43100 | return TLO.CombineTo( | |||
| 43101 | Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA)); | |||
| 43102 | } | |||
| 43103 | } | |||
| 43104 | ||||
| 43105 | APInt DemandedSrc = DemandedElts.lshr(ShiftAmt); | |||
| 43106 | if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO, | |||
| 43107 | Depth + 1)) | |||
| 43108 | return true; | |||
| 43109 | ||||
| 43110 | KnownUndef <<= ShiftAmt; | |||
| 43111 | KnownZero <<= ShiftAmt; | |||
| 43112 | KnownZero.setLowBits(ShiftAmt); | |||
| 43113 | break; | |||
| 43114 | } | |||
| 43115 | case X86ISD::KSHIFTR: { | |||
| 43116 | SDValue Src = Op.getOperand(0); | |||
| 43117 | auto *Amt = cast<ConstantSDNode>(Op.getOperand(1)); | |||
| 43118 | assert(Amt->getAPIntValue().ult(NumElts) && "Out of range shift amount"); | |||
| 43119 | unsigned ShiftAmt = Amt->getZExtValue(); | |||
| 43120 | ||||
| 43121 | if (ShiftAmt == 0) | |||
| 43122 | return TLO.CombineTo(Op, Src); | |||
| 43123 | ||||
| 43124 | // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a | |||
| 43125 | // single shift. We can do this if the top bits (which are shifted | |||
| 43126 | // out) are never demanded. | |||
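| | // e.g. (illustrative, v8i1): kshiftr(kshiftl(X, 3), 1) --> kshiftl(X, 2) | |||
| | // when the top result element is not demanded. | |||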
| 43127 | if (Src.getOpcode() == X86ISD::KSHIFTL) { | |||
| 43128 | if (!DemandedElts.intersects(APInt::getHighBitsSet(NumElts, ShiftAmt))) { | |||
| 43129 | unsigned C1 = Src.getConstantOperandVal(1); | |||
| 43130 | unsigned NewOpc = X86ISD::KSHIFTR; | |||
| 43131 | int Diff = ShiftAmt - C1; | |||
| 43132 | if (Diff < 0) { | |||
| 43133 | Diff = -Diff; | |||
| 43134 | NewOpc = X86ISD::KSHIFTL; | |||
| 43135 | } | |||
| 43136 | ||||
| 43137 | SDLoc dl(Op); | |||
| 43138 | SDValue NewSA = TLO.DAG.getTargetConstant(Diff, dl, MVT::i8); | |||
| 43139 | return TLO.CombineTo( | |||
| 43140 | Op, TLO.DAG.getNode(NewOpc, dl, VT, Src.getOperand(0), NewSA)); | |||
| 43141 | } | |||
| 43142 | } | |||
| 43143 | ||||
| 43144 | APInt DemandedSrc = DemandedElts.shl(ShiftAmt); | |||
| 43145 | if (SimplifyDemandedVectorElts(Src, DemandedSrc, KnownUndef, KnownZero, TLO, | |||
| 43146 | Depth + 1)) | |||
| 43147 | return true; | |||
| 43148 | ||||
| 43149 | KnownUndef.lshrInPlace(ShiftAmt); | |||
| 43150 | KnownZero.lshrInPlace(ShiftAmt); | |||
| 43151 | KnownZero.setHighBits(ShiftAmt); | |||
| 43152 | break; | |||
| 43153 | } | |||
| 43154 | case X86ISD::ANDNP: { | |||
| 43155 | // ANDNP = (~LHS & RHS); | |||
| 43156 | SDValue LHS = Op.getOperand(0); | |||
| 43157 | SDValue RHS = Op.getOperand(1); | |||
| 43158 | ||||
| 43159 | auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) { | |||
| 43160 | APInt UndefElts; | |||
| 43161 | SmallVector<APInt> EltBits; | |||
| 43162 | int NumElts = VT.getVectorNumElements(); | |||
| 43163 | int EltSizeInBits = VT.getScalarSizeInBits(); | |||
| 43164 | APInt OpBits = APInt::getAllOnes(EltSizeInBits); | |||
| 43165 | APInt OpElts = DemandedElts; | |||
| 43166 | if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, | |||
| 43167 | EltBits)) { | |||
| 43168 | OpBits.clearAllBits(); | |||
| 43169 | OpElts.clearAllBits(); | |||
| 43170 | for (int I = 0; I != NumElts; ++I) { | |||
| 43171 | if (!DemandedElts[I]) | |||
| 43172 | continue; | |||
| 43173 | if (UndefElts[I]) { | |||
| 43174 | // We can't assume an undef src element gives an undef dst - the | |||
| 43175 | // other src might be zero. | |||
| 43176 | OpBits.setAllBits(); | |||
| 43177 | OpElts.setBit(I); | |||
| 43178 | } else if ((Invert && !EltBits[I].isAllOnes()) || | |||
| 43179 | (!Invert && !EltBits[I].isZero())) { | |||
| 43180 | OpBits |= Invert ? ~EltBits[I] : EltBits[I]; | |||
| 43181 | OpElts.setBit(I); | |||
| 43182 | } | |||
| 43183 | } | |||
| 43184 | } | |||
| 43185 | return std::make_pair(OpBits, OpElts); | |||
| 43186 | }; | |||
| 43187 | APInt BitsLHS, EltsLHS; | |||
| 43188 | APInt BitsRHS, EltsRHS; | |||
| 43189 | std::tie(BitsLHS, EltsLHS) = GetDemandedMasks(RHS); | |||
| 43190 | std::tie(BitsRHS, EltsRHS) = GetDemandedMasks(LHS, true); | |||
| 43191 | ||||
| 43192 | APInt LHSUndef, LHSZero; | |||
| 43193 | APInt RHSUndef, RHSZero; | |||
| 43194 | if (SimplifyDemandedVectorElts(LHS, EltsLHS, LHSUndef, LHSZero, TLO, | |||
| 43195 | Depth + 1)) | |||
| 43196 | return true; | |||
| 43197 | if (SimplifyDemandedVectorElts(RHS, EltsRHS, RHSUndef, RHSZero, TLO, | |||
| 43198 | Depth + 1)) | |||
| 43199 | return true; | |||
| 43200 | ||||
| 43201 | if (!DemandedElts.isAllOnes()) { | |||
| 43202 | SDValue NewLHS = SimplifyMultipleUseDemandedBits(LHS, BitsLHS, EltsLHS, | |||
| 43203 | TLO.DAG, Depth + 1); | |||
| 43204 | SDValue NewRHS = SimplifyMultipleUseDemandedBits(RHS, BitsRHS, EltsRHS, | |||
| 43205 | TLO.DAG, Depth + 1); | |||
| 43206 | if (NewLHS || NewRHS) { | |||
| 43207 | NewLHS = NewLHS ? NewLHS : LHS; | |||
| 43208 | NewRHS = NewRHS ? NewRHS : RHS; | |||
| 43209 | return TLO.CombineTo( | |||
| 43210 | Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewLHS, NewRHS)); | |||
| 43211 | } | |||
| 43212 | } | |||
| 43213 | break; | |||
| 43214 | } | |||
| 43215 | case X86ISD::CVTSI2P: | |||
| 43216 | case X86ISD::CVTUI2P: { | |||
| 43217 | SDValue Src = Op.getOperand(0); | |||
| 43218 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 43219 | APInt SrcUndef, SrcZero; | |||
| 43220 | APInt SrcElts = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); | |||
| 43221 | if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO, | |||
| 43222 | Depth + 1)) | |||
| 43223 | return true; | |||
| 43224 | break; | |||
| 43225 | } | |||
| 43226 | case X86ISD::PACKSS: | |||
| 43227 | case X86ISD::PACKUS: { | |||
| 43228 | SDValue N0 = Op.getOperand(0); | |||
| 43229 | SDValue N1 = Op.getOperand(1); | |||
| 43230 | ||||
| 43231 | APInt DemandedLHS, DemandedRHS; | |||
| 43232 | getPackDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); | |||
| 43233 | ||||
| 43234 | APInt LHSUndef, LHSZero; | |||
| 43235 | if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO, | |||
| 43236 | Depth + 1)) | |||
| 43237 | return true; | |||
| 43238 | APInt RHSUndef, RHSZero; | |||
| 43239 | if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO, | |||
| 43240 | Depth + 1)) | |||
| 43241 | return true; | |||
| 43242 | ||||
| 43243 | // TODO - pass on known zero/undef. | |||
| 43244 | ||||
| 43245 | // Aggressively peek through ops to get at the demanded elts. | |||
| 43246 | // TODO - we should do this for all target/faux shuffles ops. | |||
| 43247 | if (!DemandedElts.isAllOnes()) { | |||
| 43248 | SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS, | |||
| 43249 | TLO.DAG, Depth + 1); | |||
| 43250 | SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS, | |||
| 43251 | TLO.DAG, Depth + 1); | |||
| 43252 | if (NewN0 || NewN1) { | |||
| 43253 | NewN0 = NewN0 ? NewN0 : N0; | |||
| 43254 | NewN1 = NewN1 ? NewN1 : N1; | |||
| 43255 | return TLO.CombineTo(Op, | |||
| 43256 | TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1)); | |||
| 43257 | } | |||
| 43258 | } | |||
| 43259 | break; | |||
| 43260 | } | |||
| 43261 | case X86ISD::HADD: | |||
| 43262 | case X86ISD::HSUB: | |||
| 43263 | case X86ISD::FHADD: | |||
| 43264 | case X86ISD::FHSUB: { | |||
| 43265 | SDValue N0 = Op.getOperand(0); | |||
| 43266 | SDValue N1 = Op.getOperand(1); | |||
| 43267 | ||||
| 43268 | APInt DemandedLHS, DemandedRHS; | |||
| 43269 | getHorizDemandedElts(VT, DemandedElts, DemandedLHS, DemandedRHS); | |||
| 43270 | ||||
| 43271 | APInt LHSUndef, LHSZero; | |||
| 43272 | if (SimplifyDemandedVectorElts(N0, DemandedLHS, LHSUndef, LHSZero, TLO, | |||
| 43273 | Depth + 1)) | |||
| 43274 | return true; | |||
| 43275 | APInt RHSUndef, RHSZero; | |||
| 43276 | if (SimplifyDemandedVectorElts(N1, DemandedRHS, RHSUndef, RHSZero, TLO, | |||
| 43277 | Depth + 1)) | |||
| 43278 | return true; | |||
| 43279 | ||||
| 43280 | // TODO - pass on known zero/undef. | |||
| 43281 | ||||
| 43282 | // Aggressively peek through ops to get at the demanded elts. | |||
| 43283 | // TODO: Handle repeated operands. | |||
| 43284 | if (N0 != N1 && !DemandedElts.isAllOnes()) { | |||
| 43285 | SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS, | |||
| 43286 | TLO.DAG, Depth + 1); | |||
| 43287 | SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS, | |||
| 43288 | TLO.DAG, Depth + 1); | |||
| 43289 | if (NewN0 || NewN1) { | |||
| 43290 | NewN0 = NewN0 ? NewN0 : N0; | |||
| 43291 | NewN1 = NewN1 ? NewN1 : N1; | |||
| 43292 | return TLO.CombineTo(Op, | |||
| 43293 | TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewN0, NewN1)); | |||
| 43294 | } | |||
| 43295 | } | |||
| 43296 | break; | |||
| 43297 | } | |||
| 43298 | case X86ISD::VTRUNC: | |||
| 43299 | case X86ISD::VTRUNCS: | |||
| 43300 | case X86ISD::VTRUNCUS: { | |||
| 43301 | SDValue Src = Op.getOperand(0); | |||
| 43302 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 43303 | APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements()); | |||
| 43304 | APInt SrcUndef, SrcZero; | |||
| 43305 | if (SimplifyDemandedVectorElts(Src, DemandedSrc, SrcUndef, SrcZero, TLO, | |||
| 43306 | Depth + 1)) | |||
| 43307 | return true; | |||
| 43308 | KnownZero = SrcZero.zextOrTrunc(NumElts); | |||
| 43309 | KnownUndef = SrcUndef.zextOrTrunc(NumElts); | |||
| 43310 | break; | |||
| 43311 | } | |||
| 43312 | case X86ISD::BLENDV: { | |||
| 43313 | APInt SelUndef, SelZero; | |||
| 43314 | if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, SelUndef, | |||
| 43315 | SelZero, TLO, Depth + 1)) | |||
| 43316 | return true; | |||
| 43317 | ||||
| 43318 | // TODO: Use SelZero to adjust LHS/RHS DemandedElts. | |||
| 43319 | APInt LHSUndef, LHSZero; | |||
| 43320 | if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, LHSUndef, | |||
| 43321 | LHSZero, TLO, Depth + 1)) | |||
| 43322 | return true; | |||
| 43323 | ||||
| 43324 | APInt RHSUndef, RHSZero; | |||
| 43325 | if (SimplifyDemandedVectorElts(Op.getOperand(2), DemandedElts, RHSUndef, | |||
| 43326 | RHSZero, TLO, Depth + 1)) | |||
| 43327 | return true; | |||
| 43328 | ||||
| 43329 | KnownZero = LHSZero & RHSZero; | |||
| 43330 | KnownUndef = LHSUndef & RHSUndef; | |||
| 43331 | break; | |||
| 43332 | } | |||
| 43333 | case X86ISD::VZEXT_MOVL: { | |||
| 43334 | // If upper demanded elements are already zero then we have nothing to do. | |||
| 43335 | SDValue Src = Op.getOperand(0); | |||
| 43336 | APInt DemandedUpperElts = DemandedElts; | |||
| 43337 | DemandedUpperElts.clearLowBits(1); | |||
| 43338 | if (TLO.DAG.MaskedVectorIsZero(Src, DemandedUpperElts, Depth + 1)) | |||
| 43339 | return TLO.CombineTo(Op, Src); | |||
| 43340 | break; | |||
| 43341 | } | |||
| 43342 | case X86ISD::VBROADCAST: { | |||
| 43343 | SDValue Src = Op.getOperand(0); | |||
| 43344 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 43345 | if (!SrcVT.isVector()) | |||
| 43346 | break; | |||
| 43347 | // Don't bother broadcasting if we just need the 0'th element. | |||
| 43348 | if (DemandedElts == 1) { | |||
| 43349 | if (Src.getValueType() != VT) | |||
| 43350 | Src = widenSubVector(VT.getSimpleVT(), Src, false, Subtarget, TLO.DAG, | |||
| 43351 | SDLoc(Op)); | |||
| 43352 | return TLO.CombineTo(Op, Src); | |||
| 43353 | } | |||
| 43354 | APInt SrcUndef, SrcZero; | |||
| 43355 | APInt SrcElts = APInt::getOneBitSet(SrcVT.getVectorNumElements(), 0); | |||
| 43356 | if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO, | |||
| 43357 | Depth + 1)) | |||
| 43358 | return true; | |||
| 43359 | // Aggressively peek through src to get at the demanded elt. | |||
| 43360 | // TODO - we should do this for all target/faux shuffles ops. | |||
| 43361 | if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts( | |||
| 43362 | Src, SrcElts, TLO.DAG, Depth + 1)) | |||
| 43363 | return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc)); | |||
| 43364 | break; | |||
| 43365 | } | |||
| 43366 | case X86ISD::VPERMV: | |||
| 43367 | if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO, | |||
| 43368 | Depth)) | |||
| 43369 | return true; | |||
| 43370 | break; | |||
| 43371 | case X86ISD::PSHUFB: | |||
| 43372 | case X86ISD::VPERMV3: | |||
| 43373 | case X86ISD::VPERMILPV: | |||
| 43374 | if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO, | |||
| 43375 | Depth)) | |||
| 43376 | return true; | |||
| 43377 | break; | |||
| 43378 | case X86ISD::VPPERM: | |||
| 43379 | case X86ISD::VPERMIL2: | |||
| 43380 | if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO, | |||
| 43381 | Depth)) | |||
| 43382 | return true; | |||
| 43383 | break; | |||
| 43384 | } | |||
| 43385 | ||||
| 43386 | // For 256/512-bit ops that are 128/256-bit ops glued together, if we do not | |||
| 43387 | // demand any of the high elements, then narrow the op to 128/256-bits: e.g. | |||
| 43388 | // (op ymm0, ymm1) --> insert undef, (op xmm0, xmm1), 0 | |||
| 43389 | if ((VT.is256BitVector() || VT.is512BitVector()) && | |||
| 43390 | DemandedElts.lshr(NumElts / 2) == 0) { | |||
| 43391 | unsigned SizeInBits = VT.getSizeInBits(); | |||
| 43392 | unsigned ExtSizeInBits = SizeInBits / 2; | |||
| 43393 | ||||
| 43394 | // See if 512-bit ops only use the bottom 128-bits. | |||
| 43395 | if (VT.is512BitVector() && DemandedElts.lshr(NumElts / 4) == 0) | |||
| 43396 | ExtSizeInBits = SizeInBits / 4; | |||
| 43397 | ||||
| 43398 | switch (Opc) { | |||
| 43399 | // Scalar broadcast. | |||
| 43400 | case X86ISD::VBROADCAST: { | |||
| 43401 | SDLoc DL(Op); | |||
| 43402 | SDValue Src = Op.getOperand(0); | |||
| 43403 | if (Src.getValueSizeInBits() > ExtSizeInBits) | |||
| 43404 | Src = extractSubVector(Src, 0, TLO.DAG, DL, ExtSizeInBits); | |||
| 43405 | EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(), | |||
| 43406 | ExtSizeInBits / VT.getScalarSizeInBits()); | |||
| 43407 | SDValue Bcst = TLO.DAG.getNode(X86ISD::VBROADCAST, DL, BcstVT, Src); | |||
| 43408 | return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0, | |||
| 43409 | TLO.DAG, DL, ExtSizeInBits)); | |||
| 43410 | } | |||
| 43411 | case X86ISD::VBROADCAST_LOAD: { | |||
| 43412 | SDLoc DL(Op); | |||
| 43413 | auto *MemIntr = cast<MemIntrinsicSDNode>(Op); | |||
| 43414 | EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(), | |||
| 43415 | ExtSizeInBits / VT.getScalarSizeInBits()); | |||
| 43416 | SDVTList Tys = TLO.DAG.getVTList(BcstVT, MVT::Other); | |||
| 43417 | SDValue Ops[] = {MemIntr->getOperand(0), MemIntr->getOperand(1)}; | |||
| 43418 | SDValue Bcst = TLO.DAG.getMemIntrinsicNode( | |||
| 43419 | X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(), | |||
| 43420 | MemIntr->getMemOperand()); | |||
| 43421 | TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), | |||
| 43422 | Bcst.getValue(1)); | |||
| 43423 | return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Bcst, 0, | |||
| 43424 | TLO.DAG, DL, ExtSizeInBits)); | |||
| 43425 | } | |||
| 43426 | // Subvector broadcast. | |||
| 43427 | case X86ISD::SUBV_BROADCAST_LOAD: { | |||
| 43428 | auto *MemIntr = cast<MemIntrinsicSDNode>(Op); | |||
| 43429 | EVT MemVT = MemIntr->getMemoryVT(); | |||
| 43430 | if (ExtSizeInBits == MemVT.getStoreSizeInBits()) { | |||
| 43431 | SDLoc DL(Op); | |||
| 43432 | SDValue Ld = | |||
| 43433 | TLO.DAG.getLoad(MemVT, DL, MemIntr->getChain(), | |||
| 43434 | MemIntr->getBasePtr(), MemIntr->getMemOperand()); | |||
| 43435 | TLO.DAG.makeEquivalentMemoryOrdering(SDValue(MemIntr, 1), | |||
| 43436 | Ld.getValue(1)); | |||
| 43437 | return TLO.CombineTo(Op, insertSubVector(TLO.DAG.getUNDEF(VT), Ld, 0, | |||
| 43438 | TLO.DAG, DL, ExtSizeInBits)); | |||
| 43439 | } else if ((ExtSizeInBits % MemVT.getStoreSizeInBits()) == 0) { | |||
| 43440 | SDLoc DL(Op); | |||
| 43441 | EVT BcstVT = EVT::getVectorVT(*TLO.DAG.getContext(), VT.getScalarType(), | |||
| 43442 | ExtSizeInBits / VT.getScalarSizeInBits()); | |||
| 43443 | if (SDValue BcstLd = | |||
| 43444 | getBROADCAST_LOAD(Opc, DL, BcstVT, MemVT, MemIntr, 0, TLO.DAG)) | |||
| 43445 | return TLO.CombineTo(Op, | |||
| 43446 | insertSubVector(TLO.DAG.getUNDEF(VT), BcstLd, 0, | |||
| 43447 | TLO.DAG, DL, ExtSizeInBits)); | |||
| 43448 | } | |||
| 43449 | break; | |||
| 43450 | } | |||
| 43451 | // Byte shifts by immediate. | |||
| 43452 | case X86ISD::VSHLDQ: | |||
| 43453 | case X86ISD::VSRLDQ: | |||
| 43454 | // Shift by uniform. | |||
| 43455 | case X86ISD::VSHL: | |||
| 43456 | case X86ISD::VSRL: | |||
| 43457 | case X86ISD::VSRA: | |||
| 43458 | // Shift by immediate. | |||
| 43459 | case X86ISD::VSHLI: | |||
| 43460 | case X86ISD::VSRLI: | |||
| 43461 | case X86ISD::VSRAI: { | |||
| 43462 | SDLoc DL(Op); | |||
| 43463 | SDValue Ext0 = | |||
| 43464 | extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits); | |||
| 43465 | SDValue ExtOp = | |||
| 43466 | TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0, Op.getOperand(1)); | |||
| 43467 | SDValue UndefVec = TLO.DAG.getUNDEF(VT); | |||
| 43468 | SDValue Insert = | |||
| 43469 | insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); | |||
| 43470 | return TLO.CombineTo(Op, Insert); | |||
| 43471 | } | |||
| 43472 | case X86ISD::VPERMI: { | |||
| 43473 | // Simplify PERMPD/PERMQ to extract_subvector. | |||
| 43474 | // TODO: This should be done in shuffle combining. | |||
| 43475 | if (VT == MVT::v4f64 || VT == MVT::v4i64) { | |||
| 43476 | SmallVector<int, 4> Mask; | |||
| 43477 | DecodeVPERMMask(NumElts, Op.getConstantOperandVal(1), Mask); | |||
| 43478 | if (isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3)) { | |||
| 43479 | SDLoc DL(Op); | |||
| 43480 | SDValue Ext = extractSubVector(Op.getOperand(0), 2, TLO.DAG, DL, 128); | |||
| 43481 | SDValue UndefVec = TLO.DAG.getUNDEF(VT); | |||
| 43482 | SDValue Insert = insertSubVector(UndefVec, Ext, 0, TLO.DAG, DL, 128); | |||
| 43483 | return TLO.CombineTo(Op, Insert); | |||
| 43484 | } | |||
| 43485 | } | |||
| 43486 | break; | |||
| 43487 | } | |||
| 43488 | case X86ISD::VPERM2X128: { | |||
| 43489 | // Simplify VPERM2F128/VPERM2I128 to extract_subvector. | |||
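| | // The low nibble of the immediate controls the result's low lane: bit 3 | |||
| | // zeroes it, bit 1 selects the source operand, and bit 0 selects that | |||
| | // operand's low or high 128-bit half. | |||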
| 43490 | SDLoc DL(Op); | |||
| 43491 | unsigned LoMask = Op.getConstantOperandVal(2) & 0xF; | |||
| 43492 | if (LoMask & 0x8) | |||
| 43493 | return TLO.CombineTo( | |||
| 43494 | Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, DL)); | |||
| 43495 | unsigned EltIdx = (LoMask & 0x1) * (NumElts / 2); | |||
| 43496 | unsigned SrcIdx = (LoMask & 0x2) >> 1; | |||
| 43497 | SDValue ExtOp = | |||
| 43498 | extractSubVector(Op.getOperand(SrcIdx), EltIdx, TLO.DAG, DL, 128); | |||
| 43499 | SDValue UndefVec = TLO.DAG.getUNDEF(VT); | |||
| 43500 | SDValue Insert = | |||
| 43501 | insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); | |||
| 43502 | return TLO.CombineTo(Op, Insert); | |||
| 43503 | } | |||
| 43504 | // Zero upper elements. | |||
| 43505 | case X86ISD::VZEXT_MOVL: | |||
| 43506 | // Target unary shuffles by immediate: | |||
| 43507 | case X86ISD::PSHUFD: | |||
| 43508 | case X86ISD::PSHUFLW: | |||
| 43509 | case X86ISD::PSHUFHW: | |||
| 43510 | case X86ISD::VPERMILPI: | |||
| 43511 | // (Non-Lane Crossing) Target Shuffles. | |||
| 43512 | case X86ISD::VPERMILPV: | |||
| 43513 | case X86ISD::VPERMIL2: | |||
| 43514 | case X86ISD::PSHUFB: | |||
| 43515 | case X86ISD::UNPCKL: | |||
| 43516 | case X86ISD::UNPCKH: | |||
| 43517 | case X86ISD::BLENDI: | |||
| 43518 | // Integer ops. | |||
| 43519 | case X86ISD::PACKSS: | |||
| 43520 | case X86ISD::PACKUS: | |||
| 43521 | // Horizontal Ops. | |||
| 43522 | case X86ISD::HADD: | |||
| 43523 | case X86ISD::HSUB: | |||
| 43524 | case X86ISD::FHADD: | |||
| 43525 | case X86ISD::FHSUB: { | |||
| 43526 | SDLoc DL(Op); | |||
| 43527 | SmallVector<SDValue, 4> Ops; | |||
| 43528 | for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) { | |||
| 43529 | SDValue SrcOp = Op.getOperand(i); | |||
| 43530 | EVT SrcVT = SrcOp.getValueType(); | |||
| 43531 | assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) && | |||
| 43532 | "Unsupported vector size"); | |||
| 43533 | Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL, | |||
| 43534 | ExtSizeInBits) | |||
| 43535 | : SrcOp); | |||
| 43536 | } | |||
| 43537 | MVT ExtVT = VT.getSimpleVT(); | |||
| 43538 | ExtVT = MVT::getVectorVT(ExtVT.getScalarType(), | |||
| 43539 | ExtSizeInBits / ExtVT.getScalarSizeInBits()); | |||
| 43540 | SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops); | |||
| 43541 | SDValue UndefVec = TLO.DAG.getUNDEF(VT); | |||
| 43542 | SDValue Insert = | |||
| 43543 | insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits); | |||
| 43544 | return TLO.CombineTo(Op, Insert); | |||
| 43545 | } | |||
| 43546 | } | |||
| 43547 | } | |||
| 43548 | ||||
| 43549 | // For splats, unless we *only* demand the 0'th element, stop attempting | |||
| 43550 | // simplification here - we aren't going to improve things, and the splat | |||
| 43551 | // is already better than any potential shuffle. | |||
| 43552 | if (!DemandedElts.isOne() && TLO.DAG.isSplatValue(Op, /*AllowUndefs*/false)) | |||
| 43553 | return false; | |||
| 43554 | ||||
| 43555 | // Get target/faux shuffle mask. | |||
| 43556 | APInt OpUndef, OpZero; | |||
| 43557 | SmallVector<int, 64> OpMask; | |||
| 43558 | SmallVector<SDValue, 2> OpInputs; | |||
| 43559 | if (!getTargetShuffleInputs(Op, DemandedElts, OpInputs, OpMask, OpUndef, | |||
| 43560 | OpZero, TLO.DAG, Depth, false)) | |||
| 43561 | return false; | |||
| 43562 | ||||
| 43563 | // Shuffle inputs must be the same size as the result. | |||
| 43564 | if (OpMask.size() != (unsigned)NumElts || | |||
| 43565 | llvm::any_of(OpInputs, [VT](SDValue V) { | |||
| 43566 | return VT.getSizeInBits() != V.getValueSizeInBits() || | |||
| 43567 | !V.getValueType().isVector(); | |||
| 43568 | })) | |||
| 43569 | return false; | |||
| 43570 | ||||
| 43571 | KnownZero = OpZero; | |||
| 43572 | KnownUndef = OpUndef; | |||
| 43573 | ||||
| 43574 | // Check if shuffle mask can be simplified to undef/zero/identity. | |||
| 43575 | int NumSrcs = OpInputs.size(); | |||
| 43576 | for (int i = 0; i != NumElts; ++i) | |||
| 43577 | if (!DemandedElts[i]) | |||
| 43578 | OpMask[i] = SM_SentinelUndef; | |||
| 43579 | ||||
| 43580 | if (isUndefInRange(OpMask, 0, NumElts)) { | |||
| 43581 | KnownUndef.setAllBits(); | |||
| 43582 | return TLO.CombineTo(Op, TLO.DAG.getUNDEF(VT)); | |||
| 43583 | } | |||
| 43584 | if (isUndefOrZeroInRange(OpMask, 0, NumElts)) { | |||
| 43585 | KnownZero.setAllBits(); | |||
| 43586 | return TLO.CombineTo( | |||
| 43587 | Op, getZeroVector(VT.getSimpleVT(), Subtarget, TLO.DAG, SDLoc(Op))); | |||
| 43588 | } | |||
| 43589 | for (int Src = 0; Src != NumSrcs; ++Src) | |||
| 43590 | if (isSequentialOrUndefInRange(OpMask, 0, NumElts, Src * NumElts)) | |||
| 43591 | return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, OpInputs[Src])); | |||
| 43592 | ||||
| 43593 | // Attempt to simplify inputs. | |||
| 43594 | for (int Src = 0; Src != NumSrcs; ++Src) { | |||
| 43595 | // TODO: Support inputs of different types. | |||
| 43596 | if (OpInputs[Src].getValueType() != VT) | |||
| 43597 | continue; | |||
| 43598 | ||||
| 43599 | int Lo = Src * NumElts; | |||
| 43600 | APInt SrcElts = APInt::getZero(NumElts); | |||
| 43601 | for (int i = 0; i != NumElts; ++i) | |||
| 43602 | if (DemandedElts[i]) { | |||
| 43603 | int M = OpMask[i] - Lo; | |||
| 43604 | if (0 <= M && M < NumElts) | |||
| 43605 | SrcElts.setBit(M); | |||
| 43606 | } | |||
| 43607 | ||||
| 43608 | // TODO - Propagate input undef/zero elts. | |||
| 43609 | APInt SrcUndef, SrcZero; | |||
| 43610 | if (SimplifyDemandedVectorElts(OpInputs[Src], SrcElts, SrcUndef, SrcZero, | |||
| 43611 | TLO, Depth + 1)) | |||
| 43612 | return true; | |||
| 43613 | } | |||
| 43614 | ||||
| 43615 | // If we don't demand all elements, then attempt to combine to a simpler | |||
| 43616 | // shuffle. | |||
| 43617 | // We need to convert the depth to something combineX86ShufflesRecursively | |||
| 43618 | // can handle - so pretend it's Depth == 0 again, and reduce the max depth | |||
| 43619 | // to match. This prevents combineX86ShuffleChain from returning a | |||
| 43620 | // combined shuffle that's the same as the original root, causing an | |||
| 43621 | // infinite loop. | |||
| 43622 | if (!DemandedElts.isAllOnes()) { | |||
| 43623 | assert(Depth < X86::MaxShuffleCombineDepth && "Depth out of range"); | |||
| 43624 | ||||
| 43625 | SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef); | |||
| 43626 | for (int i = 0; i != NumElts; ++i) | |||
| 43627 | if (DemandedElts[i]) | |||
| 43628 | DemandedMask[i] = i; | |||
| 43629 | ||||
| 43630 | SDValue NewShuffle = combineX86ShufflesRecursively( | |||
| 43631 | {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth, | |||
| 43632 | /*HasVarMask*/ false, | |||
| 43633 | /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG, | |||
| 43634 | Subtarget); | |||
| 43635 | if (NewShuffle) | |||
| 43636 | return TLO.CombineTo(Op, NewShuffle); | |||
| 43637 | } | |||
| 43638 | ||||
| 43639 | return false; | |||
| 43640 | } | |||
| 43641 | ||||
| 43642 | bool X86TargetLowering::SimplifyDemandedBitsForTargetNode( | |||
| 43643 | SDValue Op, const APInt &OriginalDemandedBits, | |||
| 43644 | const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, | |||
| 43645 | unsigned Depth) const { | |||
| 43646 | EVT VT = Op.getValueType(); | |||
| 43647 | unsigned BitWidth = OriginalDemandedBits.getBitWidth(); | |||
| 43648 | unsigned Opc = Op.getOpcode(); | |||
| 43649 | switch(Opc) { | |||
| 43650 | case X86ISD::VTRUNC: { | |||
| 43651 | KnownBits KnownOp; | |||
| 43652 | SDValue Src = Op.getOperand(0); | |||
| 43653 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 43654 | ||||
| 43655 | // Simplify the input, using demanded bit information. | |||
| 43656 | APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits()); | |||
| 43657 | APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements()); | |||
| 43658 | if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1)) | |||
| 43659 | return true; | |||
| 43660 | break; | |||
| 43661 | } | |||
| 43662 | case X86ISD::PMULDQ: | |||
| 43663 | case X86ISD::PMULUDQ: { | |||
| 43664 | // PMULDQ/PMULUDQ only use the lower 32 bits from each vector element. | |||
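| | // (Each i64 result element is the product of the low 32 bits of the | |||
| | // corresponding source elements - sign-extended for PMULDQ, zero-extended | |||
| | // for PMULUDQ - so the upper 32 bits of each source element are dead.) | |||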
| 43665 | KnownBits KnownLHS, KnownRHS; | |||
| 43666 | SDValue LHS = Op.getOperand(0); | |||
| 43667 | SDValue RHS = Op.getOperand(1); | |||
| 43668 | ||||
| 43669 | // Don't mask bits on 32-bit AVX512 targets which might lose a broadcast. | |||
| 43670 | // FIXME: Can we bound this better? | |||
| 43671 | APInt DemandedMask = APInt::getLowBitsSet(64, 32); | |||
| 43672 | APInt DemandedMaskLHS = APInt::getAllOnes(64); | |||
| 43673 | APInt DemandedMaskRHS = APInt::getAllOnes(64); | |||
| 43674 | ||||
| 43675 | bool Is32BitAVX512 = !Subtarget.is64Bit() && Subtarget.hasAVX512(); | |||
| 43676 | if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(LHS)) | |||
| 43677 | DemandedMaskLHS = DemandedMask; | |||
| 43678 | if (!Is32BitAVX512 || !TLO.DAG.isSplatValue(RHS)) | |||
| 43679 | DemandedMaskRHS = DemandedMask; | |||
| 43680 | ||||
| 43681 | if (SimplifyDemandedBits(LHS, DemandedMaskLHS, OriginalDemandedElts, | |||
| 43682 | KnownLHS, TLO, Depth + 1)) | |||
| 43683 | return true; | |||
| 43684 | if (SimplifyDemandedBits(RHS, DemandedMaskRHS, OriginalDemandedElts, | |||
| 43685 | KnownRHS, TLO, Depth + 1)) | |||
| 43686 | return true; | |||
| 43687 | ||||
| 43688 | // PMULUDQ(X,1) -> AND(X,(1<<32)-1) 'getZeroExtendInReg'. | |||
| 43689 | KnownRHS = KnownRHS.trunc(32); | |||
| 43690 | if (Opc == X86ISD::PMULUDQ && KnownRHS.isConstant() && | |||
| 43691 | KnownRHS.getConstant().isOne()) { | |||
| 43692 | SDLoc DL(Op); | |||
| 43693 | SDValue Mask = TLO.DAG.getConstant(DemandedMask, DL, VT); | |||
| 43694 | return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, DL, VT, LHS, Mask)); | |||
| 43695 | } | |||
| 43696 | ||||
| 43697 | // Aggressively peek through ops to get at the demanded low bits. | |||
| 43698 | SDValue DemandedLHS = SimplifyMultipleUseDemandedBits( | |||
| 43699 | LHS, DemandedMaskLHS, OriginalDemandedElts, TLO.DAG, Depth + 1); | |||
| 43700 | SDValue DemandedRHS = SimplifyMultipleUseDemandedBits( | |||
| 43701 | RHS, DemandedMaskRHS, OriginalDemandedElts, TLO.DAG, Depth + 1); | |||
| 43702 | if (DemandedLHS || DemandedRHS) { | |||
| 43703 | DemandedLHS = DemandedLHS ? DemandedLHS : LHS; | |||
| 43704 | DemandedRHS = DemandedRHS ? DemandedRHS : RHS; | |||
| 43705 | return TLO.CombineTo( | |||
| 43706 | Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, DemandedLHS, DemandedRHS)); | |||
| 43707 | } | |||
| 43708 | break; | |||
| 43709 | } | |||
| 43710 | case X86ISD::VSHLI: { | |||
| 43711 | SDValue Op0 = Op.getOperand(0); | |||
| 43712 | ||||
| 43713 | unsigned ShAmt = Op.getConstantOperandVal(1); | |||
| 43714 | if (ShAmt >= BitWidth) | |||
| 43715 | break; | |||
| 43716 | ||||
| 43717 | APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt); | |||
| 43718 | ||||
| 43719 | // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a | |||
| 43720 | // single shift. We can do this if the bottom bits (which are shifted | |||
| 43721 | // out) are never demanded. | |||
| 43722 | if (Op0.getOpcode() == X86ISD::VSRLI && | |||
| 43723 | OriginalDemandedBits.countr_zero() >= ShAmt) { | |||
| 43724 | unsigned Shift2Amt = Op0.getConstantOperandVal(1); | |||
| 43725 | if (Shift2Amt < BitWidth) { | |||
| 43726 | int Diff = ShAmt - Shift2Amt; | |||
| 43727 | if (Diff == 0) | |||
| 43728 | return TLO.CombineTo(Op, Op0.getOperand(0)); | |||
| 43729 | ||||
| 43730 | unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI; | |||
| 43731 | SDValue NewShift = TLO.DAG.getNode( | |||
| 43732 | NewOpc, SDLoc(Op), VT, Op0.getOperand(0), | |||
| 43733 | TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8)); | |||
| 43734 | return TLO.CombineTo(Op, NewShift); | |||
| 43735 | } | |||
| 43736 | } | |||
| 43737 | ||||
| 43738 | // If we only demand sign bits then we can use the shift source directly. | |||
| 43739 | unsigned NumSignBits = | |||
| 43740 | TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1); | |||
| 43741 | unsigned UpperDemandedBits = BitWidth - OriginalDemandedBits.countr_zero(); | |||
| 43742 | if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits) | |||
| 43743 | return TLO.CombineTo(Op, Op0); | |||
| 43744 | ||||
| 43745 | if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, | |||
| 43746 | TLO, Depth + 1)) | |||
| 43747 | return true; | |||
| 43748 | ||||
| 43749 | assert(!Known.hasConflict() && "Bits known to be one AND zero?"); | |||
| 43750 | Known.Zero <<= ShAmt; | |||
| 43751 | Known.One <<= ShAmt; | |||
| 43752 | ||||
| 43753 | // Low bits known zero. | |||
| 43754 | Known.Zero.setLowBits(ShAmt); | |||
| 43755 | return false; | |||
| 43756 | } | |||
| 43757 | case X86ISD::VSRLI: { | |||
| 43758 | unsigned ShAmt = Op.getConstantOperandVal(1); | |||
| 43759 | if (ShAmt >= BitWidth) | |||
| 43760 | break; | |||
| 43761 | ||||
| 43762 | APInt DemandedMask = OriginalDemandedBits << ShAmt; | |||
| 43763 | ||||
| 43764 | if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask, | |||
| 43765 | OriginalDemandedElts, Known, TLO, Depth + 1)) | |||
| 43766 | return true; | |||
| 43767 | ||||
| 43768 | assert(!Known.hasConflict() && "Bits known to be one AND zero?"); | |||
| 43769 | Known.Zero.lshrInPlace(ShAmt); | |||
| 43770 | Known.One.lshrInPlace(ShAmt); | |||
| 43771 | ||||
| 43772 | // High bits known zero. | |||
| 43773 | Known.Zero.setHighBits(ShAmt); | |||
| 43774 | return false; | |||
| 43775 | } | |||
| 43776 | case X86ISD::VSRAI: { | |||
| 43777 | SDValue Op0 = Op.getOperand(0); | |||
| 43778 | SDValue Op1 = Op.getOperand(1); | |||
| 43779 | ||||
| 43780 | unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue(); | |||
| 43781 | if (ShAmt >= BitWidth) | |||
| 43782 | break; | |||
| 43783 | ||||
| 43784 | APInt DemandedMask = OriginalDemandedBits << ShAmt; | |||
| 43785 | ||||
| 43786 | // If we just want the sign bit then we don't need to shift it. | |||
| 43787 | if (OriginalDemandedBits.isSignMask()) | |||
| 43788 | return TLO.CombineTo(Op, Op0); | |||
| 43789 | ||||
| 43790 | // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1 | |||
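| | // e.g. (illustrative) with 32-bit elements and C1 == 8: if X has more than | |||
| | // 8 sign bits then (X << 8) >> 8 (arithmetic) sign-extends from bit 23 and | |||
| | // reproduces X exactly, making the shift pair a no-op. | |||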
| 43791 | if (Op0.getOpcode() == X86ISD::VSHLI && | |||
| 43792 | Op.getOperand(1) == Op0.getOperand(1)) { | |||
| 43793 | SDValue Op00 = Op0.getOperand(0); | |||
| 43794 | unsigned NumSignBits = | |||
| 43795 | TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts); | |||
| 43796 | if (ShAmt < NumSignBits) | |||
| 43797 | return TLO.CombineTo(Op, Op00); | |||
| 43798 | } | |||
| 43799 | ||||
| 43800 | // If any of the demanded bits are produced by the sign extension, we also | |||
| 43801 | // demand the input sign bit. | |||
| 43802 | if (OriginalDemandedBits.countl_zero() < ShAmt) | |||
| 43803 | DemandedMask.setSignBit(); | |||
| 43804 | ||||
| 43805 | if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known, | |||
| 43806 | TLO, Depth + 1)) | |||
| 43807 | return true; | |||
| 43808 | ||||
| 43809 | assert(!Known.hasConflict() && "Bits known to be one AND zero?"); | |||
| 43810 | Known.Zero.lshrInPlace(ShAmt); | |||
| 43811 | Known.One.lshrInPlace(ShAmt); | |||
| 43812 | ||||
| 43813 | // If the input sign bit is known to be zero, or if none of the top bits | |||
| 43814 | // are demanded, turn this into an unsigned shift right. | |||
| 43815 | if (Known.Zero[BitWidth - ShAmt - 1] || | |||
| 43816 | OriginalDemandedBits.countl_zero() >= ShAmt) | |||
| 43817 | return TLO.CombineTo( | |||
| 43818 | Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1)); | |||
| 43819 | ||||
| 43820 | // High bits are known one. | |||
| 43821 | if (Known.One[BitWidth - ShAmt - 1]) | |||
| 43822 | Known.One.setHighBits(ShAmt); | |||
| 43823 | return false; | |||
| 43824 | } | |||
| 43825 | case X86ISD::BLENDV: { | |||
| 43826 | SDValue Sel = Op.getOperand(0); | |||
| 43827 | SDValue LHS = Op.getOperand(1); | |||
| 43828 | SDValue RHS = Op.getOperand(2); | |||
| 43829 | ||||
| 43830 | APInt SignMask = APInt::getSignMask(BitWidth); | |||
| 43831 | SDValue NewSel = SimplifyMultipleUseDemandedBits( | |||
| 43832 | Sel, SignMask, OriginalDemandedElts, TLO.DAG, Depth + 1); | |||
| 43833 | SDValue NewLHS = SimplifyMultipleUseDemandedBits( | |||
| 43834 | LHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1); | |||
| 43835 | SDValue NewRHS = SimplifyMultipleUseDemandedBits( | |||
| 43836 | RHS, OriginalDemandedBits, OriginalDemandedElts, TLO.DAG, Depth + 1); | |||
| 43837 | ||||
| 43838 | if (NewSel || NewLHS || NewRHS) { | |||
| 43839 | NewSel = NewSel ? NewSel : Sel; | |||
| 43840 | NewLHS = NewLHS ? NewLHS : LHS; | |||
| 43841 | NewRHS = NewRHS ? NewRHS : RHS; | |||
| 43842 | return TLO.CombineTo(Op, TLO.DAG.getNode(X86ISD::BLENDV, SDLoc(Op), VT, | |||
| 43843 | NewSel, NewLHS, NewRHS)); | |||
| 43844 | } | |||
| 43845 | break; | |||
| 43846 | } | |||
| 43847 | case X86ISD::PEXTRB: | |||
| 43848 | case X86ISD::PEXTRW: { | |||
| 43849 | SDValue Vec = Op.getOperand(0); | |||
| 43850 | auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1)); | |||
| 43851 | MVT VecVT = Vec.getSimpleValueType(); | |||
| 43852 | unsigned NumVecElts = VecVT.getVectorNumElements(); | |||
| 43853 | ||||
| 43854 | if (CIdx && CIdx->getAPIntValue().ult(NumVecElts)) { | |||
| 43855 | unsigned Idx = CIdx->getZExtValue(); | |||
| 43856 | unsigned VecBitWidth = VecVT.getScalarSizeInBits(); | |||
| 43857 | ||||
| 43858 | // If we demand no bits from the vector then we must have demanded | |||
| 43859 | // bits from the implicit zext - simplify to zero. | |||
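| | // (PEXTRB/PEXTRW zero-extend the extracted element into the scalar result, | |||
| | // so any demanded bit at or above the element width comes from those zeros.) | |||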
| 43860 | APInt DemandedVecBits = OriginalDemandedBits.trunc(VecBitWidth); | |||
| 43861 | if (DemandedVecBits == 0) | |||
| 43862 | return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); | |||
| 43863 | ||||
| 43864 | APInt KnownUndef, KnownZero; | |||
| 43865 | APInt DemandedVecElts = APInt::getOneBitSet(NumVecElts, Idx); | |||
| 43866 | if (SimplifyDemandedVectorElts(Vec, DemandedVecElts, KnownUndef, | |||
| 43867 | KnownZero, TLO, Depth + 1)) | |||
| 43868 | return true; | |||
| 43869 | ||||
| 43870 | KnownBits KnownVec; | |||
| 43871 | if (SimplifyDemandedBits(Vec, DemandedVecBits, DemandedVecElts, | |||
| 43872 | KnownVec, TLO, Depth + 1)) | |||
| 43873 | return true; | |||
| 43874 | ||||
| 43875 | if (SDValue V = SimplifyMultipleUseDemandedBits( | |||
| 43876 | Vec, DemandedVecBits, DemandedVecElts, TLO.DAG, Depth + 1)) | |||
| 43877 | return TLO.CombineTo( | |||
| 43878 | Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1))); | |||
| 43879 | ||||
| 43880 | Known = KnownVec.zext(BitWidth); | |||
| 43881 | return false; | |||
| 43882 | } | |||
| 43883 | break; | |||
| 43884 | } | |||
| 43885 | case X86ISD::PINSRB: | |||
| 43886 | case X86ISD::PINSRW: { | |||
| 43887 | SDValue Vec = Op.getOperand(0); | |||
| 43888 | SDValue Scl = Op.getOperand(1); | |||
| 43889 | auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2)); | |||
| 43890 | MVT VecVT = Vec.getSimpleValueType(); | |||
| 43891 | ||||
| 43892 | if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements())) { | |||
| 43893 | unsigned Idx = CIdx->getZExtValue(); | |||
| 43894 | if (!OriginalDemandedElts[Idx]) | |||
| 43895 | return TLO.CombineTo(Op, Vec); | |||
| 43896 | ||||
| 43897 | KnownBits KnownVec; | |||
| 43898 | APInt DemandedVecElts(OriginalDemandedElts); | |||
| 43899 | DemandedVecElts.clearBit(Idx); | |||
| 43900 | if (SimplifyDemandedBits(Vec, OriginalDemandedBits, DemandedVecElts, | |||
| 43901 | KnownVec, TLO, Depth + 1)) | |||
| 43902 | return true; | |||
| 43903 | ||||
| 43904 | KnownBits KnownScl; | |||
| 43905 | unsigned NumSclBits = Scl.getScalarValueSizeInBits(); | |||
| 43906 | APInt DemandedSclBits = OriginalDemandedBits.zext(NumSclBits); | |||
| 43907 | if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1)) | |||
| 43908 | return true; | |||
| 43909 | ||||
| 43910 | KnownScl = KnownScl.trunc(VecVT.getScalarSizeInBits()); | |||
| 43911 | Known = KnownBits::commonBits(KnownVec, KnownScl); | |||
| 43912 | return false; | |||
| 43913 | } | |||
| 43914 | break; | |||
| 43915 | } | |||
| 43916 | case X86ISD::PACKSS: | |||
| 43917 | // PACKSS saturates to MIN/MAX integer values. So if we just want the | |||
| 43918 | // sign bit then we can just ask for the source operands' sign bits. | |||
| 43919 | // TODO - add known bits handling. | |||
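| | // (Signed saturation preserves the sign: a value clamped to MIN/MAX keeps | |||
| | // the sign of the original wide element, so each packed sign bit equals | |||
| | // the corresponding source element's sign bit.) | |||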
| 43920 | if (OriginalDemandedBits.isSignMask()) { | |||
| 43921 | APInt DemandedLHS, DemandedRHS; | |||
| 43922 | getPackDemandedElts(VT, OriginalDemandedElts, DemandedLHS, DemandedRHS); | |||
| 43923 | ||||
| 43924 | KnownBits KnownLHS, KnownRHS; | |||
| 43925 | APInt SignMask = APInt::getSignMask(BitWidth * 2); | |||
| 43926 | if (SimplifyDemandedBits(Op.getOperand(0), SignMask, DemandedLHS, | |||
| 43927 | KnownLHS, TLO, Depth + 1)) | |||
| 43928 | return true; | |||
| 43929 | if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS, | |||
| 43930 | KnownRHS, TLO, Depth + 1)) | |||
| 43931 | return true; | |||
| 43932 | ||||
| 43933 | // Attempt to avoid multi-use ops if we don't need anything from them. | |||
| 43934 | SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits( | |||
| 43935 | Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1); | |||
| 43936 | SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits( | |||
| 43937 | Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1); | |||
| 43938 | if (DemandedOp0 || DemandedOp1) { | |||
| 43939 | SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0); | |||
| 43940 | SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1); | |||
| 43941 | return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1)); | |||
| 43942 | } | |||
| 43943 | } | |||
| 43944 | // TODO - add general PACKSS/PACKUS SimplifyDemandedBits support. | |||
| 43945 | break; | |||
| 43946 | case X86ISD::VBROADCAST: { | |||
| 43947 | SDValue Src = Op.getOperand(0); | |||
| 43948 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 43949 | APInt DemandedElts = APInt::getOneBitSet( | |||
| 43950 | SrcVT.isVector() ? SrcVT.getVectorNumElements() : 1, 0); | |||
| 43951 | if (SimplifyDemandedBits(Src, OriginalDemandedBits, DemandedElts, Known, | |||
| 43952 | TLO, Depth + 1)) | |||
| 43953 | return true; | |||
| 43954 | // If we don't need the upper bits, attempt to narrow the broadcast source. | |||
| 43955 | // Don't attempt this on AVX512 as it might affect broadcast folding. | |||
| 43956 | // TODO: Should we attempt this for i32/i16 splats? They tend to be slower. | |||
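| | // e.g. (illustrative) vbroadcast i64 X to v4i64, with only the low 32 bits | |||
| | // of each element demanded, becomes bitcast(vbroadcast (trunc X to i32) | |||
| | // to v8i32). | |||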
| 43957 | if ((BitWidth == 64) && SrcVT.isScalarInteger() && !Subtarget.hasAVX512() && | |||
| 43958 | OriginalDemandedBits.countl_zero() >= (BitWidth / 2) && | |||
| 43959 | Src->hasOneUse()) { | |||
| 43960 | MVT NewSrcVT = MVT::getIntegerVT(BitWidth / 2); | |||
| 43961 | SDValue NewSrc = | |||
| 43962 | TLO.DAG.getNode(ISD::TRUNCATE, SDLoc(Src), NewSrcVT, Src); | |||
| 43963 | MVT NewVT = MVT::getVectorVT(NewSrcVT, VT.getVectorNumElements() * 2); | |||
| 43964 | SDValue NewBcst = | |||
| 43965 | TLO.DAG.getNode(X86ISD::VBROADCAST, SDLoc(Op), NewVT, NewSrc); | |||
| 43966 | return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, NewBcst)); | |||
| 43967 | } | |||
| 43968 | break; | |||
| 43969 | } | |||
| 43970 | case X86ISD::PCMPGT: | |||
| 43971 | // icmp sgt(0, R) == ashr(R, BitWidth-1). | |||
| 43972 | // If we only need the sign bit then we can use R directly. | |||
| 43973 | if (OriginalDemandedBits.isSignMask() && | |||
| 43974 | ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) | |||
| 43975 | return TLO.CombineTo(Op, Op.getOperand(1)); | |||
| 43976 | break; | |||
| 43977 | case X86ISD::MOVMSK: { | |||
| 43978 | SDValue Src = Op.getOperand(0); | |||
| 43979 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 43980 | unsigned SrcBits = SrcVT.getScalarSizeInBits(); | |||
| 43981 | unsigned NumElts = SrcVT.getVectorNumElements(); | |||
| 43982 | ||||
| 43983 | // If we don't need the sign bits at all just return zero. | |||
| 43984 | if (OriginalDemandedBits.countr_zero() >= NumElts) | |||
| 43985 | return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); | |||
| 43986 | ||||
| 43987 | // See if we only demand bits from the lower 128-bit vector. | |||
| 43988 | if (SrcVT.is256BitVector() && | |||
| 43989 | OriginalDemandedBits.getActiveBits() <= (NumElts / 2)) { | |||
| 43990 | SDValue NewSrc = extract128BitVector(Src, 0, TLO.DAG, SDLoc(Src)); | |||
| 43991 | return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc)); | |||
| 43992 | } | |||
| 43993 | ||||
| 43994 | // Only demand the vector elements of the sign bits we need. | |||
| 43995 | APInt KnownUndef, KnownZero; | |||
| 43996 | APInt DemandedElts = OriginalDemandedBits.zextOrTrunc(NumElts); | |||
| 43997 | if (SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero, | |||
| 43998 | TLO, Depth + 1)) | |||
| 43999 | return true; | |||
| 44000 | ||||
| 44001 | Known.Zero = KnownZero.zext(BitWidth); | |||
| 44002 | Known.Zero.setHighBits(BitWidth - NumElts); | |||
| 44003 | ||||
| 44004 | // MOVMSK only uses the MSB from each vector element. | |||
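| | // e.g. for v4f32 MOVMSKPS, result bit i is bit 31 of element i and the | |||
| | // remaining 28 result bits are always zero. | |||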
| 44005 | KnownBits KnownSrc; | |||
| 44006 | APInt DemandedSrcBits = APInt::getSignMask(SrcBits); | |||
| 44007 | if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO, | |||
| 44008 | Depth + 1)) | |||
| 44009 | return true; | |||
| 44010 | ||||
| 44011 | if (KnownSrc.One[SrcBits - 1]) | |||
| 44012 | Known.One.setLowBits(NumElts); | |||
| 44013 | else if (KnownSrc.Zero[SrcBits - 1]) | |||
| 44014 | Known.Zero.setLowBits(NumElts); | |||
| 44015 | ||||
| 44016 | // Attempt to avoid multi-use ops if we don't need anything from them. | |||
| 44017 | if (SDValue NewSrc = SimplifyMultipleUseDemandedBits( | |||
| 44018 | Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1)) | |||
| 44019 | return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc)); | |||
| 44020 | return false; | |||
| 44021 | } | |||
| 44022 | case X86ISD::TESTP: { | |||
| 44023 | SDValue Op0 = Op.getOperand(0); | |||
| 44024 | SDValue Op1 = Op.getOperand(1); | |||
| 44025 | MVT OpVT = Op0.getSimpleValueType(); | |||
| 44026 | assert((OpVT.getVectorElementType() == MVT::f32 || | |||
| 44027 | OpVT.getVectorElementType() == MVT::f64) && | |||
| 44028 | "Illegal vector type for X86ISD::TESTP"); | |||
| 44029 | ||||
| 44030 | // TESTPS/TESTPD only demands the sign bits of ALL the elements. | |||
| 44031 | KnownBits KnownSrc; | |||
| 44032 | APInt SignMask = APInt::getSignMask(OpVT.getScalarSizeInBits()); | |||
| 44033 | bool AssumeSingleUse = (Op0 == Op1) && Op->isOnlyUserOf(Op0.getNode()); | |||
| 44034 | return SimplifyDemandedBits(Op0, SignMask, KnownSrc, TLO, Depth + 1, | |||
| 44035 | AssumeSingleUse) || | |||
| 44036 | SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1, | |||
| 44037 | AssumeSingleUse); | |||
| 44038 | } | |||
| 44039 | case X86ISD::BEXTR: | |||
| 44040 | case X86ISD::BEXTRI: { | |||
| 44041 | SDValue Op0 = Op.getOperand(0); | |||
| 44042 | SDValue Op1 = Op.getOperand(1); | |||
| 44043 | ||||
| 44044 | // Only bottom 16-bits of the control bits are required. | |||
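| | // The control value layout (matching the extractBitsAsZExtValue calls | |||
| | // below): bits[7:0] = Shift (start bit index), bits[15:8] = Length | |||
| | // (number of bits to extract). | |||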
| 44045 | if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) { | |||
| 44046 | // NOTE: SimplifyDemandedBits won't do this for constants. | |||
| 44047 | uint64_t Val1 = Cst1->getZExtValue(); | |||
| 44048 | uint64_t MaskedVal1 = Val1 & 0xFFFF; | |||
| 44049 | if (Opc == X86ISD::BEXTR && MaskedVal1 != Val1) { | |||
| 44050 | SDLoc DL(Op); | |||
| 44051 | return TLO.CombineTo( | |||
| 44052 | Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0, | |||
| 44053 | TLO.DAG.getConstant(MaskedVal1, DL, VT))); | |||
| 44054 | } | |||
| 44055 | ||||
| 44056 | unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0); | |||
| 44057 | unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8); | |||
| 44058 | ||||
| 44059 | // If the length is 0, the result is 0. | |||
| 44060 | if (Length == 0) { | |||
| 44061 | Known.setAllZero(); | |||
| 44062 | return false; | |||
| 44063 | } | |||
| 44064 | ||||
| 44065 | if ((Shift + Length) <= BitWidth) { | |||
| 44066 | APInt DemandedMask = APInt::getBitsSet(BitWidth, Shift, Shift + Length); | |||
| 44067 | if (SimplifyDemandedBits(Op0, DemandedMask, Known, TLO, Depth + 1)) | |||
| 44068 | return true; | |||
| 44069 | ||||
| 44070 | Known = Known.extractBits(Length, Shift); | |||
| 44071 | Known = Known.zextOrTrunc(BitWidth); | |||
| 44072 | return false; | |||
| 44073 | } | |||
| 44074 | } else { | |||
| 44075 | assert(Opc == X86ISD::BEXTR && "Unexpected opcode!"); | |||
| 44076 | KnownBits Known1; | |||
| 44077 | APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16)); | |||
| 44078 | if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1)) | |||
| 44079 | return true; | |||
| 44080 | ||||
| 44081 | // If the length is 0, replace with 0. | |||
| 44082 | KnownBits LengthBits = Known1.extractBits(8, 8); | |||
| 44083 | if (LengthBits.isZero()) | |||
| 44084 | return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT)); | |||
| 44085 | } | |||
| 44086 | ||||
| 44087 | break; | |||
| 44088 | } | |||
| 44089 | case X86ISD::PDEP: { | |||
| 44090 | SDValue Op0 = Op.getOperand(0); | |||
| 44091 | SDValue Op1 = Op.getOperand(1); | |||
| 44092 | ||||
| 44093 | unsigned DemandedBitsLZ = OriginalDemandedBits.countl_zero(); | |||
| 44094 | APInt LoMask = APInt::getLowBitsSet(BitWidth, BitWidth - DemandedBitsLZ); | |||
| 44095 | ||||
| 44096 | // If the demanded bits have leading zeroes, we don't demand those from the | |||
| 44097 | // mask. | |||
| 44098 | if (SimplifyDemandedBits(Op1, LoMask, Known, TLO, Depth + 1)) | |||
| 44099 | return true; | |||
| 44100 | ||||
| 44101 | // The number of possible 1s in the mask determines the number of LSBs of | |||
| 44102 | // operand 0 used. Undemanded bits from the mask don't matter, so filter | |||
| 44103 | // them before counting. | |||
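| | // e.g. a mask of 0b0101 deposits only the 2 lowest bits of operand 0: | |||
| | // src bit 0 -> result bit 0, src bit 1 -> result bit 2. | |||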
| 44104 | KnownBits Known2; | |||
| 44105 | uint64_t Count = (~Known.Zero & LoMask).popcount(); | |||
| 44106 | APInt DemandedMask(APInt::getLowBitsSet(BitWidth, Count)); | |||
| 44107 | if (SimplifyDemandedBits(Op0, DemandedMask, Known2, TLO, Depth + 1)) | |||
| 44108 | return true; | |||
| 44109 | ||||
| 44110 | // Zeroes are retained from the mask, but not ones. | |||
| 44111 | Known.One.clearAllBits(); | |||
| 44112 | // The result will have at least as many trailing zeros as the non-mask | |||
| 44113 | // operand since bits can only map to the same or higher bit position. | |||
| 44114 | Known.Zero.setLowBits(Known2.countMinTrailingZeros()); | |||
| 44115 | return false; | |||
| 44116 | } | |||
| 44117 | } | |||
| 44118 | ||||
| 44119 | return TargetLowering::SimplifyDemandedBitsForTargetNode( | |||
| 44120 | Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth); | |||
| 44121 | } | |||
| 44122 | ||||
| 44123 | SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( | |||
| 44124 | SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, | |||
| 44125 | SelectionDAG &DAG, unsigned Depth) const { | |||
| 44126 | int NumElts = DemandedElts.getBitWidth(); | |||
| 44127 | unsigned Opc = Op.getOpcode(); | |||
| 44128 | EVT VT = Op.getValueType(); | |||
| 44129 | ||||
| 44130 | switch (Opc) { | |||
| 44131 | case X86ISD::PINSRB: | |||
| 44132 | case X86ISD::PINSRW: { | |||
| 44133 | // If we don't demand the inserted element, return the base vector. | |||
| 44134 | SDValue Vec = Op.getOperand(0); | |||
| 44135 | auto *CIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2)); | |||
| 44136 | MVT VecVT = Vec.getSimpleValueType(); | |||
| 44137 | if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) && | |||
| 44138 | !DemandedElts[CIdx->getZExtValue()]) | |||
| 44139 | return Vec; | |||
| 44140 | break; | |||
| 44141 | } | |||
| 44142 | case X86ISD::VSHLI: { | |||
| 44143 | // If we are only demanding sign bits then we can use the shift source | |||
| 44144 | // directly. | |||
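| | // e.g. for (vXi32 shl x, 8) where x has at least 16 sign bits, the top | |||
| | // 8 bits of the result are still copies of x's sign bit, so x can be | |||
| | // returned unchanged when only those bits are demanded. | |||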
| 44145 | SDValue Op0 = Op.getOperand(0); | |||
| 44146 | unsigned ShAmt = Op.getConstantOperandVal(1); | |||
| 44147 | unsigned BitWidth = DemandedBits.getBitWidth(); | |||
| 44148 | unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1); | |||
| 44149 | unsigned UpperDemandedBits = BitWidth - DemandedBits.countr_zero(); | |||
| 44150 | if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits) | |||
| 44151 | return Op0; | |||
| 44152 | break; | |||
| 44153 | } | |||
| 44154 | case X86ISD::VSRAI: | |||
| 44155 | // iff we only need the sign bit then we can use the source directly. | |||
| 44156 | // TODO: generalize where we only demand extended signbits. | |||
| 44157 | if (DemandedBits.isSignMask()) | |||
| 44158 | return Op.getOperand(0); | |||
| 44159 | break; | |||
| 44160 | case X86ISD::PCMPGT: | |||
| 44161 | // icmp sgt(0, R) == ashr(R, BitWidth-1). | |||
| 44162 | // iff we only need the sign bit then we can use R directly. | |||
| 44163 | if (DemandedBits.isSignMask() && | |||
| 44164 | ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode())) | |||
| 44165 | return Op.getOperand(1); | |||
| 44166 | break; | |||
| 44167 | case X86ISD::ANDNP: { | |||
| 44168 | // ANDNP = (~LHS & RHS); | |||
| 44169 | SDValue LHS = Op.getOperand(0); | |||
| 44170 | SDValue RHS = Op.getOperand(1); | |||
| 44171 | ||||
| 44172 | KnownBits LHSKnown = DAG.computeKnownBits(LHS, DemandedElts, Depth + 1); | |||
| 44173 | KnownBits RHSKnown = DAG.computeKnownBits(RHS, DemandedElts, Depth + 1); | |||
| 44174 | ||||
| 44175 | // If all of the demanded bits are known 0 on LHS and known 0 on RHS, then | |||
| 44176 | // the (inverted) LHS bits cannot contribute to the result of the 'andn' in | |||
| 44177 | // this context, so return RHS. | |||
| 44178 | if (DemandedBits.isSubsetOf(RHSKnown.Zero | LHSKnown.Zero)) | |||
| 44179 | return RHS; | |||
| 44180 | break; | |||
| 44181 | } | |||
| 44182 | } | |||
| 44183 | ||||
| 44184 | APInt ShuffleUndef, ShuffleZero; | |||
| 44185 | SmallVector<int, 16> ShuffleMask; | |||
| 44186 | SmallVector<SDValue, 2> ShuffleOps; | |||
| 44187 | if (getTargetShuffleInputs(Op, DemandedElts, ShuffleOps, ShuffleMask, | |||
| 44188 | ShuffleUndef, ShuffleZero, DAG, Depth, false)) { | |||
| 44189 | // If all the demanded elts are from one operand and are inline, | |||
| 44190 | // then we can use the operand directly. | |||
| 44191 | int NumOps = ShuffleOps.size(); | |||
| 44192 | if (ShuffleMask.size() == (unsigned)NumElts && | |||
| 44193 | llvm::all_of(ShuffleOps, [VT](SDValue V) { | |||
| 44194 | return VT.getSizeInBits() == V.getValueSizeInBits(); | |||
| 44195 | })) { | |||
| 44196 | ||||
| 44197 | if (DemandedElts.isSubsetOf(ShuffleUndef)) | |||
| 44198 | return DAG.getUNDEF(VT); | |||
| 44199 | if (DemandedElts.isSubsetOf(ShuffleUndef | ShuffleZero)) | |||
| 44200 | return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(Op)); | |||
| 44201 | ||||
| 44202 | // Bitmask that indicates which ops have only been accessed 'inline'. | |||
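| | // e.g. for a mask <0,1,2,3> drawn entirely from ShuffleOps[0], only bit 0 | |||
| | // of IdentityOp survives the loop and ShuffleOps[0] is returned below. | |||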
| 44203 | APInt IdentityOp = APInt::getAllOnes(NumOps); | |||
| 44204 | for (int i = 0; i != NumElts; ++i) { | |||
| 44205 | int M = ShuffleMask[i]; | |||
| 44206 | if (!DemandedElts[i] || ShuffleUndef[i]) | |||
| 44207 | continue; | |||
| 44208 | int OpIdx = M / NumElts; | |||
| 44209 | int EltIdx = M % NumElts; | |||
| 44210 | if (M < 0 || EltIdx != i) { | |||
| 44211 | IdentityOp.clearAllBits(); | |||
| 44212 | break; | |||
| 44213 | } | |||
| 44214 | IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx); | |||
| 44215 | if (IdentityOp == 0) | |||
| 44216 | break; | |||
| 44217 | } | |||
| 44218 | assert((IdentityOp == 0 || IdentityOp.popcount() == 1) && | |||
| 44219 | "Multiple identity shuffles detected"); | |||
| 44220 | ||||
| 44221 | if (IdentityOp != 0) | |||
| 44222 | return DAG.getBitcast(VT, ShuffleOps[IdentityOp.countr_zero()]); | |||
| 44223 | } | |||
| 44224 | } | |||
| 44225 | ||||
| 44226 | return TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode( | |||
| 44227 | Op, DemandedBits, DemandedElts, DAG, Depth); | |||
| 44228 | } | |||
| 44229 | ||||
| 44230 | bool X86TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( | |||
| 44231 | SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, | |||
| 44232 | bool PoisonOnly, unsigned Depth) const { | |||
| 44233 | unsigned EltsBits = Op.getScalarValueSizeInBits(); | |||
| 44234 | unsigned NumElts = DemandedElts.getBitWidth(); | |||
| 44235 | ||||
| 44236 | // TODO: Add more target shuffles. | |||
| 44237 | switch (Op.getOpcode()) { | |||
| 44238 | case X86ISD::PSHUFD: | |||
| 44239 | case X86ISD::VPERMILPI: { | |||
| 44240 | SmallVector<int, 8> Mask; | |||
| 44241 | DecodePSHUFMask(NumElts, EltsBits, Op.getConstantOperandVal(1), Mask); | |||
| 44242 | ||||
| 44243 | APInt DemandedSrcElts = APInt::getZero(NumElts); | |||
| 44244 | for (unsigned I = 0; I != NumElts; ++I) | |||
| 44245 | if (DemandedElts[I]) | |||
| 44246 | DemandedSrcElts.setBit(Mask[I]); | |||
| 44247 | ||||
| 44248 | return DAG.isGuaranteedNotToBeUndefOrPoison( | |||
| 44249 | Op.getOperand(0), DemandedSrcElts, PoisonOnly, Depth + 1); | |||
| 44250 | } | |||
| 44251 | } | |||
| 44252 | return TargetLowering::isGuaranteedNotToBeUndefOrPoisonForTargetNode( | |||
| 44253 | Op, DemandedElts, DAG, PoisonOnly, Depth); | |||
| 44254 | } | |||
| 44255 | ||||
| 44256 | bool X86TargetLowering::canCreateUndefOrPoisonForTargetNode( | |||
| 44257 | SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, | |||
| 44258 | bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const { | |||
| 44259 | ||||
| 44260 | // TODO: Add more target shuffles. | |||
| 44261 | switch (Op.getOpcode()) { | |||
| 44262 | case X86ISD::PSHUFD: | |||
| 44263 | case X86ISD::VPERMILPI: | |||
| 44264 | return false; | |||
| 44265 | } | |||
| 44266 | return TargetLowering::canCreateUndefOrPoisonForTargetNode( | |||
| 44267 | Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth); | |||
| 44268 | } | |||
| 44269 | ||||
| 44270 | bool X86TargetLowering::isSplatValueForTargetNode(SDValue Op, | |||
| 44271 | const APInt &DemandedElts, | |||
| 44272 | APInt &UndefElts, | |||
| 44273 | const SelectionDAG &DAG, | |||
| 44274 | unsigned Depth) const { | |||
| 44275 | unsigned NumElts = DemandedElts.getBitWidth(); | |||
| 44276 | unsigned Opc = Op.getOpcode(); | |||
| 44277 | ||||
| 44278 | switch (Opc) { | |||
| 44279 | case X86ISD::VBROADCAST: | |||
| 44280 | case X86ISD::VBROADCAST_LOAD: | |||
| 44281 | UndefElts = APInt::getZero(NumElts); | |||
| 44282 | return true; | |||
| 44283 | } | |||
| 44284 | ||||
| 44285 | return TargetLowering::isSplatValueForTargetNode(Op, DemandedElts, UndefElts, | |||
| 44286 | DAG, Depth); | |||
| 44287 | } | |||
| 44288 | ||||
| 44289 | // Helper to peek through bitops/trunc/setcc to determine size of source vector. | |||
| 44290 | // Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>. | |||
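| | // e.g. (v8i1 setcc (v8i32 a), (v8i32 b), cc) reports a 256-bit source; | |||
| | // bitwise ops and vselects are looked through recursively on each operand. | |||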
| 44291 | static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size, | |||
| 44292 | bool AllowTruncate) { | |||
| 44293 | switch (Src.getOpcode()) { | |||
| 44294 | case ISD::TRUNCATE: | |||
| 44295 | if (!AllowTruncate) | |||
| 44296 | return false; | |||
| 44297 | [[fallthrough]]; | |||
| 44298 | case ISD::SETCC: | |||
| 44299 | return Src.getOperand(0).getValueSizeInBits() == Size; | |||
| 44300 | case ISD::AND: | |||
| 44301 | case ISD::XOR: | |||
| 44302 | case ISD::OR: | |||
| 44303 | return checkBitcastSrcVectorSize(Src.getOperand(0), Size, AllowTruncate) && | |||
| 44304 | checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate); | |||
| 44305 | case ISD::SELECT: | |||
| 44306 | case ISD::VSELECT: | |||
| 44307 | return Src.getOperand(0).getScalarValueSizeInBits() == 1 && | |||
| 44308 | checkBitcastSrcVectorSize(Src.getOperand(1), Size, AllowTruncate) && | |||
| 44309 | checkBitcastSrcVectorSize(Src.getOperand(2), Size, AllowTruncate); | |||
| 44310 | case ISD::BUILD_VECTOR: | |||
| 44311 | return ISD::isBuildVectorAllZeros(Src.getNode()) || | |||
| 44312 | ISD::isBuildVectorAllOnes(Src.getNode()); | |||
| 44313 | } | |||
| 44314 | return false; | |||
| 44315 | } | |||
| 44316 | ||||
| 44317 | // Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents. | |||
| 44318 | static unsigned getAltBitOpcode(unsigned Opcode) { | |||
| 44319 | switch(Opcode) { | |||
| 44320 | case ISD::AND: return X86ISD::FAND; | |||
| 44321 | case ISD::OR: return X86ISD::FOR; | |||
| 44322 | case ISD::XOR: return X86ISD::FXOR; | |||
| 44323 | case X86ISD::ANDNP: return X86ISD::FANDN; | |||
| 44324 | } | |||
| 44325 | llvm_unreachable("Unknown bitwise opcode")::llvm::llvm_unreachable_internal("Unknown bitwise opcode", "llvm/lib/Target/X86/X86ISelLowering.cpp" , 44325); | |||
| 44326 | } | |||
| 44327 | ||||
| 44328 | // Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets. | |||
| 44329 | static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src, | |||
| 44330 | const SDLoc &DL) { | |||
| 44331 | EVT SrcVT = Src.getValueType(); | |||
| 44332 | if (SrcVT != MVT::v4i1) | |||
| 44333 | return SDValue(); | |||
| 44334 | ||||
| 44335 | switch (Src.getOpcode()) { | |||
| 44336 | case ISD::SETCC: | |||
| 44337 | if (Src.getOperand(0).getValueType() == MVT::v4i32 && | |||
| 44338 | ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) && | |||
| 44339 | cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) { | |||
| 44340 | SDValue Op0 = Src.getOperand(0); | |||
| 44341 | if (ISD::isNormalLoad(Op0.getNode())) | |||
| 44342 | return DAG.getBitcast(MVT::v4f32, Op0); | |||
| 44343 | if (Op0.getOpcode() == ISD::BITCAST && | |||
| 44344 | Op0.getOperand(0).getValueType() == MVT::v4f32) | |||
| 44345 | return Op0.getOperand(0); | |||
| 44346 | } | |||
| 44347 | break; | |||
| 44348 | case ISD::AND: | |||
| 44349 | case ISD::XOR: | |||
| 44350 | case ISD::OR: { | |||
| 44351 | SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL); | |||
| 44352 | SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL); | |||
| 44353 | if (Op0 && Op1) | |||
| 44354 | return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0, | |||
| 44355 | Op1); | |||
| 44356 | break; | |||
| 44357 | } | |||
| 44358 | } | |||
| 44359 | return SDValue(); | |||
| 44360 | } | |||
| 44361 | ||||
| 44362 | // Helper to push sign extension of vXi1 SETCC result through bitops. | |||
| 44363 | static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT, | |||
| 44364 | SDValue Src, const SDLoc &DL) { | |||
| 44365 | switch (Src.getOpcode()) { | |||
| 44366 | case ISD::SETCC: | |||
| 44367 | case ISD::TRUNCATE: | |||
| 44368 | case ISD::BUILD_VECTOR: | |||
| 44369 | return DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); | |||
| 44370 | case ISD::AND: | |||
| 44371 | case ISD::XOR: | |||
| 44372 | case ISD::OR: | |||
| 44373 | return DAG.getNode( | |||
| 44374 | Src.getOpcode(), DL, SExtVT, | |||
| 44375 | signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(0), DL), | |||
| 44376 | signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL)); | |||
| 44377 | case ISD::SELECT: | |||
| 44378 | case ISD::VSELECT: | |||
| 44379 | return DAG.getSelect( | |||
| 44380 | DL, SExtVT, Src.getOperand(0), | |||
| 44381 | signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(1), DL), | |||
| 44382 | signExtendBitcastSrcVector(DAG, SExtVT, Src.getOperand(2), DL)); | |||
| 44383 | } | |||
| 44384 | llvm_unreachable("Unexpected node type for vXi1 sign extension")::llvm::llvm_unreachable_internal("Unexpected node type for vXi1 sign extension" , "llvm/lib/Target/X86/X86ISelLowering.cpp", 44384); | |||
| 44385 | } | |||
| 44386 | ||||
| 44387 | // Try to match patterns such as | |||
| 44388 | // (i16 bitcast (v16i1 x)) | |||
| 44389 | // -> | |||
| 44390 | // (i16 movmsk (16i8 sext (v16i1 x))) | |||
| 44391 | // before the illegal vector is scalarized on subtargets that don't have legal | |||
| 44392 | // vxi1 types. | |||
| 44393 | static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src, | |||
| 44394 | const SDLoc &DL, | |||
| 44395 | const X86Subtarget &Subtarget) { | |||
| 44396 | EVT SrcVT = Src.getValueType(); | |||
| 44397 | if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1) | |||
| 44398 | return SDValue(); | |||
| 44399 | ||||
| 44400 | // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type | |||
| 44401 | // legalization destroys the v4i32 type. | |||
| 44402 | if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) { | |||
| 44403 | if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) { | |||
| 44404 | V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, | |||
| 44405 | DAG.getBitcast(MVT::v4f32, V)); | |||
| 44406 | return DAG.getZExtOrTrunc(V, DL, VT); | |||
| 44407 | } | |||
| 44408 | } | |||
| 44409 | ||||
| 44410 | // If the input is a truncate from v16i8, v32i8 or v64i8, go ahead and use a | |||
| 44411 | // movmskb even with avx512. This will be better than truncating to vXi1 and | |||
| 44412 | // using a kmov. This can especially help KNL if the input is a v16i8/v32i8 | |||
| 44413 | // vpcmpeqb/vpcmpgtb. | |||
| 44414 | bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() && | |||
| 44415 | (Src.getOperand(0).getValueType() == MVT::v16i8 || | |||
| 44416 | Src.getOperand(0).getValueType() == MVT::v32i8 || | |||
| 44417 | Src.getOperand(0).getValueType() == MVT::v64i8); | |||
| 44418 | ||||
| 44419 | // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled | |||
| 44420 | // directly with vpmovmskb/vmovmskps/vmovmskpd. | |||
| 44421 | if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() && | |||
| 44422 | cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT && | |||
| 44423 | ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) { | |||
| 44424 | EVT CmpVT = Src.getOperand(0).getValueType(); | |||
| 44425 | EVT EltVT = CmpVT.getVectorElementType(); | |||
| 44426 | if (CmpVT.getSizeInBits() <= 256 && | |||
| 44427 | (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64)) | |||
| 44428 | PreferMovMsk = true; | |||
| 44429 | } | |||
| 44430 | ||||
| 44431 | // With AVX512 vxi1 types are legal and we prefer using k-regs. | |||
| 44432 | // MOVMSK is supported in SSE2 or later. | |||
| 44433 | if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk)) | |||
| 44434 | return SDValue(); | |||
| 44435 | ||||
| 44436 | // If the upper ops of a concatenation are undef, then try to bitcast the | |||
| 44437 | // lower op and extend. | |||
| 44438 | SmallVector<SDValue, 4> SubSrcOps; | |||
| 44439 | if (collectConcatOps(Src.getNode(), SubSrcOps, DAG) && | |||
| 44440 | SubSrcOps.size() >= 2) { | |||
| 44441 | SDValue LowerOp = SubSrcOps[0]; | |||
| 44442 | ArrayRef<SDValue> UpperOps(std::next(SubSrcOps.begin()), SubSrcOps.end()); | |||
| 44443 | if (LowerOp.getOpcode() == ISD::SETCC && | |||
| 44444 | all_of(UpperOps, [](SDValue Op) { return Op.isUndef(); })) { | |||
| 44445 | EVT SubVT = VT.getIntegerVT( | |||
| 44446 | *DAG.getContext(), LowerOp.getValueType().getVectorMinNumElements()); | |||
| 44447 | if (SDValue V = combineBitcastvxi1(DAG, SubVT, LowerOp, DL, Subtarget)) { | |||
| 44448 | EVT IntVT = VT.getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); | |||
| 44449 | return DAG.getBitcast(VT, DAG.getNode(ISD::ANY_EXTEND, DL, IntVT, V)); | |||
| 44450 | } | |||
| 44451 | } | |||
| 44452 | } | |||
| 44453 | ||||
| 44454 | // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and | |||
| 44455 | // v8f64. So all legal 128-bit and 256-bit vectors are covered except for | |||
| 44456 | // v8i16 and v16i16. | |||
| 44457 | // For these two cases, we can shuffle the upper element bytes to a | |||
| 44458 | // consecutive sequence at the start of the vector and treat the results as | |||
| 44459 | // v16i8 or v32i8, and for v16i8 this is the preferable solution. However, | |||
| 44460 | // for v16i16 this is not the case, because the shuffle is expensive, so we | |||
| 44461 | // avoid sign-extending to this type entirely. | |||
| 44462 | // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: | |||
| 44463 | // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) | |||
| 44464 | MVT SExtVT; | |||
| 44465 | bool PropagateSExt = false; | |||
| 44466 | switch (SrcVT.getSimpleVT().SimpleTy) { | |||
| 44467 | default: | |||
| 44468 | return SDValue(); | |||
| 44469 | case MVT::v2i1: | |||
| 44470 | SExtVT = MVT::v2i64; | |||
| 44471 | break; | |||
| 44472 | case MVT::v4i1: | |||
| 44473 | SExtVT = MVT::v4i32; | |||
| 44474 | // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) | |||
| 44475 | // sign-extend to a 256-bit operation to avoid truncation. | |||
| 44476 | if (Subtarget.hasAVX() && | |||
| 44477 | checkBitcastSrcVectorSize(Src, 256, Subtarget.hasAVX2())) { | |||
| 44478 | SExtVT = MVT::v4i64; | |||
| 44479 | PropagateSExt = true; | |||
| 44480 | } | |||
| 44481 | break; | |||
| 44482 | case MVT::v8i1: | |||
| 44483 | SExtVT = MVT::v8i16; | |||
| 44484 | // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)), | |||
| 44485 | // sign-extend to a 256-bit operation to match the compare. | |||
| 44486 | // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over | |||
| 44487 | // 256-bit because the shuffle is cheaper than sign extending the result of | |||
| 44488 | // the compare. | |||
| 44489 | if (Subtarget.hasAVX() && (checkBitcastSrcVectorSize(Src, 256, true) || | |||
| 44490 | checkBitcastSrcVectorSize(Src, 512, true))) { | |||
| 44491 | SExtVT = MVT::v8i32; | |||
| 44492 | PropagateSExt = true; | |||
| 44493 | } | |||
| 44494 | break; | |||
| 44495 | case MVT::v16i1: | |||
| 44496 | SExtVT = MVT::v16i8; | |||
| 44497 | // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)), | |||
| 44498 | // it is not profitable to sign-extend to 256-bit because this will | |||
| 44499 | // require an extra cross-lane shuffle which is more expensive than | |||
| 44500 | // truncating the result of the compare to 128-bits. | |||
| 44501 | break; | |||
| 44502 | case MVT::v32i1: | |||
| 44503 | SExtVT = MVT::v32i8; | |||
| 44504 | break; | |||
| 44505 | case MVT::v64i1: | |||
| 44506 | // If we have AVX512F but not AVX512BW, the input is a truncate from | |||
| 44507 | // v64i8 (checked earlier); split the input and make two pmovmskbs. | |||
| 44508 | if (Subtarget.hasAVX512()) { | |||
| 44509 | if (Subtarget.hasBWI()) | |||
| 44510 | return SDValue(); | |||
| 44511 | SExtVT = MVT::v64i8; | |||
| 44512 | break; | |||
| 44513 | } | |||
| 44514 | // Split if this is a <64 x i8> comparison result. | |||
| 44515 | if (checkBitcastSrcVectorSize(Src, 512, false)) { | |||
| 44516 | SExtVT = MVT::v64i8; | |||
| 44517 | break; | |||
| 44518 | } | |||
| 44519 | return SDValue(); | |||
| 44520 | } | |||
| 44521 | ||||
| 44522 | SDValue V = PropagateSExt ? signExtendBitcastSrcVector(DAG, SExtVT, Src, DL) | |||
| 44523 | : DAG.getNode(ISD::SIGN_EXTEND, DL, SExtVT, Src); | |||
| 44524 | ||||
| 44525 | if (SExtVT == MVT::v16i8 || SExtVT == MVT::v32i8 || SExtVT == MVT::v64i8) { | |||
| 44526 | V = getPMOVMSKB(DL, V, DAG, Subtarget); | |||
| 44527 | } else { | |||
| 44528 | if (SExtVT == MVT::v8i16) | |||
| 44529 | V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V, | |||
| 44530 | DAG.getUNDEF(MVT::v8i16)); | |||
| 44531 | V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); | |||
| 44532 | } | |||
| 44533 | ||||
| 44534 | EVT IntVT = | |||
| 44535 | EVT::getIntegerVT(*DAG.getContext(), SrcVT.getVectorNumElements()); | |||
| 44536 | V = DAG.getZExtOrTrunc(V, DL, IntVT); | |||
| 44537 | return DAG.getBitcast(VT, V); | |||
| 44538 | } | |||
| 44539 | ||||
| 44540 | // Convert a vXi1 constant build vector to the same width scalar integer. | |||
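| | // e.g. (v4i1 build_vector 1,0,1,1) -> (i4 0b1101). | |||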
| 44541 | static SDValue combinevXi1ConstantToInteger(SDValue Op, SelectionDAG &DAG) { | |||
| 44542 | EVT SrcVT = Op.getValueType(); | |||
| 44543 | assert(SrcVT.getVectorElementType() == MVT::i1 && | |||
| 44544 | "Expected a vXi1 vector"); | |||
| 44545 | assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) && | |||
| 44546 | "Expected a constant build vector"); | |||
| 44547 | ||||
| 44548 | APInt Imm(SrcVT.getVectorNumElements(), 0); | |||
| 44549 | for (unsigned Idx = 0, e = Op.getNumOperands(); Idx < e; ++Idx) { | |||
| 44550 | SDValue In = Op.getOperand(Idx); | |||
| 44551 | if (!In.isUndef() && (cast<ConstantSDNode>(In)->getZExtValue() & 0x1)) | |||
| 44552 | Imm.setBit(Idx); | |||
| 44553 | } | |||
| 44554 | EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), Imm.getBitWidth()); | |||
| 44555 | return DAG.getConstant(Imm, SDLoc(Op), IntVT); | |||
| 44556 | } | |||
| 44557 | ||||
| 44558 | static SDValue combineCastedMaskArithmetic(SDNode *N, SelectionDAG &DAG, | |||
| 44559 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 44560 | const X86Subtarget &Subtarget) { | |||
| 44561 | assert(N->getOpcode() == ISD::BITCAST && "Expected a bitcast"); | |||
| 44562 | ||||
| 44563 | if (!DCI.isBeforeLegalizeOps()) | |||
| 44564 | return SDValue(); | |||
| 44565 | ||||
| 44566 | // Only do this if we have k-registers. | |||
| 44567 | if (!Subtarget.hasAVX512()) | |||
| 44568 | return SDValue(); | |||
| 44569 | ||||
| 44570 | EVT DstVT = N->getValueType(0); | |||
| 44571 | SDValue Op = N->getOperand(0); | |||
| 44572 | EVT SrcVT = Op.getValueType(); | |||
| 44573 | ||||
| 44574 | if (!Op.hasOneUse()) | |||
| 44575 | return SDValue(); | |||
| 44576 | ||||
| 44577 | // Look for logic ops. | |||
| 44578 | if (Op.getOpcode() != ISD::AND && | |||
| 44579 | Op.getOpcode() != ISD::OR && | |||
| 44580 | Op.getOpcode() != ISD::XOR) | |||
| 44581 | return SDValue(); | |||
| 44582 | ||||
| 44583 | // Make sure we have a bitcast between mask registers and a scalar type. | |||
| 44584 | if (!(SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 && | |||
| 44585 | DstVT.isScalarInteger()) && | |||
| 44586 | !(DstVT.isVector() && DstVT.getVectorElementType() == MVT::i1 && | |||
| 44587 | SrcVT.isScalarInteger())) | |||
| 44588 | return SDValue(); | |||
| 44589 | ||||
| 44590 | SDValue LHS = Op.getOperand(0); | |||
| 44591 | SDValue RHS = Op.getOperand(1); | |||
| 44592 | ||||
| 44593 | if (LHS.hasOneUse() && LHS.getOpcode() == ISD::BITCAST && | |||
| 44594 | LHS.getOperand(0).getValueType() == DstVT) | |||
| 44595 | return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, LHS.getOperand(0), | |||
| 44596 | DAG.getBitcast(DstVT, RHS)); | |||
| 44597 | ||||
| 44598 | if (RHS.hasOneUse() && RHS.getOpcode() == ISD::BITCAST && | |||
| 44599 | RHS.getOperand(0).getValueType() == DstVT) | |||
| 44600 | return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, | |||
| 44601 | DAG.getBitcast(DstVT, LHS), RHS.getOperand(0)); | |||
| 44602 | ||||
| 44603 | // If the RHS is a vXi1 build vector, this is a good reason to flip too. | |||
| 44604 | // Most of these have to move a constant from the scalar domain anyway. | |||
| 44605 | if (ISD::isBuildVectorOfConstantSDNodes(RHS.getNode())) { | |||
| 44606 | RHS = combinevXi1ConstantToInteger(RHS, DAG); | |||
| 44607 | return DAG.getNode(Op.getOpcode(), SDLoc(N), DstVT, | |||
| 44608 | DAG.getBitcast(DstVT, LHS), RHS); | |||
| 44609 | } | |||
| 44610 | ||||
| 44611 | return SDValue(); | |||
| 44612 | } | |||
| 44613 | ||||
| 44614 | static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG, | |||
| 44615 | const X86Subtarget &Subtarget) { | |||
| 44616 | SDLoc DL(BV); | |||
| 44617 | unsigned NumElts = BV->getNumOperands(); | |||
| 44618 | SDValue Splat = BV->getSplatValue(); | |||
| 44619 | ||||
| 44620 | // Build MMX element from integer GPR or SSE float values. | |||
| 44621 | auto CreateMMXElement = [&](SDValue V) { | |||
| 44622 | if (V.isUndef()) | |||
| 44623 | return DAG.getUNDEF(MVT::x86mmx); | |||
| 44624 | if (V.getValueType().isFloatingPoint()) { | |||
| 44625 | if (Subtarget.hasSSE1() && !isa<ConstantFPSDNode>(V)) { | |||
| 44626 | V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, V); | |||
| 44627 | V = DAG.getBitcast(MVT::v2i64, V); | |||
| 44628 | return DAG.getNode(X86ISD::MOVDQ2Q, DL, MVT::x86mmx, V); | |||
| 44629 | } | |||
| 44630 | V = DAG.getBitcast(MVT::i32, V); | |||
| 44631 | } else { | |||
| 44632 | V = DAG.getAnyExtOrTrunc(V, DL, MVT::i32); | |||
| 44633 | } | |||
| 44634 | return DAG.getNode(X86ISD::MMX_MOVW2D, DL, MVT::x86mmx, V); | |||
| 44635 | }; | |||
| 44636 | ||||
| 44637 | // Convert build vector ops to MMX data in the bottom elements. | |||
| 44638 | SmallVector<SDValue, 8> Ops; | |||
| 44639 | ||||
| 44640 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 44641 | ||||
| 44642 | // Broadcast - use (PUNPCKL+)PSHUFW to broadcast single element. | |||
| 44643 | if (Splat) { | |||
| 44644 | if (Splat.isUndef()) | |||
| 44645 | return DAG.getUNDEF(MVT::x86mmx); | |||
| 44646 | ||||
| 44647 | Splat = CreateMMXElement(Splat); | |||
| 44648 | ||||
| 44649 | if (Subtarget.hasSSE1()) { | |||
| 44650 | // Unpack v8i8 to splat i8 elements to lowest 16-bits. | |||
| 44651 | if (NumElts == 8) | |||
| 44652 | Splat = DAG.getNode( | |||
| 44653 | ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, | |||
| 44654 | DAG.getTargetConstant(Intrinsic::x86_mmx_punpcklbw, DL, | |||
| 44655 | TLI.getPointerTy(DAG.getDataLayout())), | |||
| 44656 | Splat, Splat); | |||
| 44657 | ||||
| 44658 | // Use PSHUFW to repeat 16-bit elements. | |||
| 44659 | unsigned ShufMask = (NumElts > 2 ? 0 : 0x44); | |||
| 44660 | return DAG.getNode( | |||
| 44661 | ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, | |||
| 44662 | DAG.getTargetConstant(Intrinsic::x86_sse_pshuf_w, DL, | |||
| 44663 | TLI.getPointerTy(DAG.getDataLayout())), | |||
| 44664 | Splat, DAG.getTargetConstant(ShufMask, DL, MVT::i8)); | |||
| 44665 | } | |||
| 44666 | Ops.append(NumElts, Splat); | |||
| 44667 | } else { | |||
| 44668 | for (unsigned i = 0; i != NumElts; ++i) | |||
| 44669 | Ops.push_back(CreateMMXElement(BV->getOperand(i))); | |||
| 44670 | } | |||
| 44671 | ||||
| 44672 | // Use tree of PUNPCKLs to build up general MMX vector. | |||
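| | // e.g. 8 x i8 elements are merged with 4 x punpcklbw, then 2 x punpcklwd, | |||
| | // then a final punpckldq into a single x86mmx value. | |||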
| 44673 | while (Ops.size() > 1) { | |||
| 44674 | unsigned NumOps = Ops.size(); | |||
| 44675 | unsigned IntrinOp = | |||
| 44676 | (NumOps == 2 ? Intrinsic::x86_mmx_punpckldq | |||
| 44677 | : (NumOps == 4 ? Intrinsic::x86_mmx_punpcklwd | |||
| 44678 | : Intrinsic::x86_mmx_punpcklbw)); | |||
| 44679 | SDValue Intrin = DAG.getTargetConstant( | |||
| 44680 | IntrinOp, DL, TLI.getPointerTy(DAG.getDataLayout())); | |||
| 44681 | for (unsigned i = 0; i != NumOps; i += 2) | |||
| 44682 | Ops[i / 2] = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::x86mmx, Intrin, | |||
| 44683 | Ops[i], Ops[i + 1]); | |||
| 44684 | Ops.resize(NumOps / 2); | |||
| 44685 | } | |||
| 44686 | ||||
| 44687 | return Ops[0]; | |||
| 44688 | } | |||
| 44689 | ||||
| 44690 | // Recursive function that attempts to find if a bool vector node was originally | |||
| 44691 | // a vector/float/double that got truncated/extended/bitcast to/from a scalar | |||
| 44692 | // integer. If so, replace the scalar ops with bool vector equivalents back down | |||
| 44693 | // the chain. | |||
| 44694 | static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, const SDLoc &DL, | |||
| 44695 | SelectionDAG &DAG, | |||
| 44696 | const X86Subtarget &Subtarget) { | |||
| 44697 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 44698 | unsigned Opc = V.getOpcode(); | |||
| 44699 | switch (Opc) { | |||
| 44700 | case ISD::BITCAST: { | |||
| 44701 | // Bitcast from a vector/float/double, we can cheaply bitcast to VT. | |||
| 44702 | SDValue Src = V.getOperand(0); | |||
| 44703 | EVT SrcVT = Src.getValueType(); | |||
| 44704 | if (SrcVT.isVector() || SrcVT.isFloatingPoint()) | |||
| 44705 | return DAG.getBitcast(VT, Src); | |||
| 44706 | break; | |||
| 44707 | } | |||
| 44708 | case ISD::TRUNCATE: { | |||
| 44709 | // If we find a suitable source, a truncated scalar becomes a subvector. | |||
| 44710 | SDValue Src = V.getOperand(0); | |||
| 44711 | EVT NewSrcVT = | |||
| 44712 | EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits()); | |||
| 44713 | if (TLI.isTypeLegal(NewSrcVT)) | |||
| 44714 | if (SDValue N0 = | |||
| 44715 | combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget)) | |||
| 44716 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0, | |||
| 44717 | DAG.getIntPtrConstant(0, DL)); | |||
| 44718 | break; | |||
| 44719 | } | |||
| 44720 | case ISD::ANY_EXTEND: | |||
| 44721 | case ISD::ZERO_EXTEND: { | |||
| 44722 | // If we find a suitable source, an extended scalar becomes a subvector. | |||
| 44723 | SDValue Src = V.getOperand(0); | |||
| 44724 | EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, | |||
| 44725 | Src.getScalarValueSizeInBits()); | |||
| 44726 | if (TLI.isTypeLegal(NewSrcVT)) | |||
| 44727 | if (SDValue N0 = | |||
| 44728 | combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget)) | |||
| 44729 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, | |||
| 44730 | Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT) | |||
| 44731 | : DAG.getConstant(0, DL, VT), | |||
| 44732 | N0, DAG.getIntPtrConstant(0, DL)); | |||
| 44733 | break; | |||
| 44734 | } | |||
| 44735 | case ISD::OR: { | |||
| 44736 | // If we find suitable sources, we can just move an OR to the vector domain. | |||
| 44737 | SDValue Src0 = V.getOperand(0); | |||
| 44738 | SDValue Src1 = V.getOperand(1); | |||
| 44739 | if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget)) | |||
| 44740 | if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget)) | |||
| 44741 | return DAG.getNode(Opc, DL, VT, N0, N1); | |||
| 44742 | break; | |||
| 44743 | } | |||
| 44744 | case ISD::SHL: { | |||
| 44745 | // If we find a suitable source, a SHL becomes a KSHIFTL. | |||
| 44746 | SDValue Src0 = V.getOperand(0); | |||
| 44747 | if ((VT == MVT::v8i1 && !Subtarget.hasDQI()) || | |||
| 44748 | ((VT == MVT::v32i1 || VT == MVT::v64i1) && !Subtarget.hasBWI())) | |||
| 44749 | break; | |||
| 44750 | ||||
| 44751 | if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1))) | |||
| 44752 | if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget)) | |||
| 44753 | return DAG.getNode( | |||
| 44754 | X86ISD::KSHIFTL, DL, VT, N0, | |||
| 44755 | DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8)); | |||
| 44756 | break; | |||
| 44757 | } | |||
| 44758 | } | |||
| 44759 | return SDValue(); | |||
| 44760 | } | |||
| 44761 | ||||
| 44762 | static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, | |||
| 44763 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 44764 | const X86Subtarget &Subtarget) { | |||
| 44765 | SDValue N0 = N->getOperand(0); | |||
| 44766 | EVT VT = N->getValueType(0); | |||
| 44767 | EVT SrcVT = N0.getValueType(); | |||
| 44768 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 44769 | ||||
| 44770 | // Try to match patterns such as | |||
| 44771 | // (i16 bitcast (v16i1 x)) | |||
| 44772 | // -> | |||
| 44773 | // (i16 movmsk (16i8 sext (v16i1 x))) | |||
| 44774 | // before the setcc result is scalarized on subtargets that don't have legal | |||
| 44775 | // vxi1 types. | |||
| 44776 | if (DCI.isBeforeLegalize()) { | |||
| 44777 | SDLoc dl(N); | |||
| 44778 | if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget)) | |||
| 44779 | return V; | |||
| 44780 | ||||
| 44781 | // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer | |||
| 44782 | // type, widen both sides to avoid a trip through memory. | |||
| 44783 | if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() && | |||
| 44784 | Subtarget.hasAVX512()) { | |||
| 44785 | N0 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i8, N0); | |||
| 44786 | N0 = DAG.getBitcast(MVT::v8i1, N0); | |||
| 44787 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, N0, | |||
| 44788 | DAG.getIntPtrConstant(0, dl)); | |||
| 44789 | } | |||
| 44790 | ||||
| 44791 | // If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer | |||
| 44792 | // type, widen both sides to avoid a trip through memory. | |||
| 44793 | if ((SrcVT == MVT::v4i1 || SrcVT == MVT::v2i1) && VT.isScalarInteger() && | |||
| 44794 | Subtarget.hasAVX512()) { | |||
| 44795 | // Use zeros for the widening if we already have some zeroes. This can | |||
| 44796 | // allow SimplifyDemandedBits to remove scalar ANDs that may be down | |||
| 44797 | // stream of this. | |||
| 44798 | // FIXME: It might make sense to detect a concat_vectors with a mix of | |||
| 44799 | // zeroes and undef and turn it into insert_subvector for i1 vectors as | |||
| 44800 | // a separate combine. What we can't do is canonicalize the operands of | |||
| 44801 | // such a concat or we'll get into a loop with SimplifyDemandedBits. | |||
| 44802 | if (N0.getOpcode() == ISD::CONCAT_VECTORS) { | |||
| 44803 | SDValue LastOp = N0.getOperand(N0.getNumOperands() - 1); | |||
| 44804 | if (ISD::isBuildVectorAllZeros(LastOp.getNode())) { | |||
| 44805 | SrcVT = LastOp.getValueType(); | |||
| 44806 | unsigned NumConcats = 8 / SrcVT.getVectorNumElements(); | |||
| 44807 | SmallVector<SDValue, 4> Ops(N0->op_begin(), N0->op_end()); | |||
| 44808 | Ops.resize(NumConcats, DAG.getConstant(0, dl, SrcVT)); | |||
| 44809 | N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); | |||
| 44810 | N0 = DAG.getBitcast(MVT::i8, N0); | |||
| 44811 | return DAG.getNode(ISD::TRUNCATE, dl, VT, N0); | |||
| 44812 | } | |||
| 44813 | } | |||
| 44814 | ||||
| 44815 | unsigned NumConcats = 8 / SrcVT.getVectorNumElements(); | |||
| 44816 | SmallVector<SDValue, 4> Ops(NumConcats, DAG.getUNDEF(SrcVT)); | |||
| 44817 | Ops[0] = N0; | |||
| 44818 | N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); | |||
| 44819 | N0 = DAG.getBitcast(MVT::i8, N0); | |||
| 44820 | return DAG.getNode(ISD::TRUNCATE, dl, VT, N0); | |||
| 44821 | } | |||
| 44822 | } else { | |||
| 44823 | // If we're bitcasting from iX to vXi1, see if the integer originally | |||
| 44824 | // began as a vXi1 and whether we can remove the bitcast entirely. | |||
| 44825 | if (VT.isVector() && VT.getScalarType() == MVT::i1 && | |||
| 44826 | SrcVT.isScalarInteger() && TLI.isTypeLegal(VT)) { | |||
| 44827 | if (SDValue V = | |||
| 44828 | combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget)) | |||
| 44829 | return V; | |||
| 44830 | } | |||
| 44831 | } | |||
| 44832 | ||||
| 44833 | // Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and | |||
| 44834 | // replace with (i8 (trunc (i16 (bitcast (v16i1 X))))). This can occur | |||
| 44835 | // due to insert_subvector legalization on KNL. By promoting the copy to i16 | |||
| 44836 | // we can help with known bits propagation from the vXi1 domain to the | |||
| 44837 | // scalar domain. | |||
| 44838 | if (VT == MVT::i8 && SrcVT == MVT::v8i1 && Subtarget.hasAVX512() && | |||
| 44839 | !Subtarget.hasDQI() && N0.getOpcode() == ISD::EXTRACT_SUBVECTOR && | |||
| 44840 | N0.getOperand(0).getValueType() == MVT::v16i1 && | |||
| 44841 | isNullConstant(N0.getOperand(1))) | |||
| 44842 | return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, | |||
| 44843 | DAG.getBitcast(MVT::i16, N0.getOperand(0))); | |||
| 44844 | ||||
| 44845 | // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast | |||
| 44846 | // and the vbroadcast_load are both integer or both fp. In some cases this | |||
| 44847 | // will remove the bitcast entirely. | |||
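| | // e.g. (v4i32 bitcast (v2f64 vbroadcast_load f64)) is rebuilt as | |||
| | // (v4i32 bitcast (v2i64 vbroadcast_load i64)) so both nodes are integer. | |||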
| 44848 | if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() && | |||
| 44849 | VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) { | |||
| 44850 | auto *BCast = cast<MemIntrinsicSDNode>(N0); | |||
| 44851 | unsigned SrcVTSize = SrcVT.getScalarSizeInBits(); | |||
| 44852 | unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits(); | |||
| 44853 | // Don't swap i8/i16 since we don't have fp types of that size. | |||
| 44854 | if (MemSize >= 32) { | |||
| 44855 | MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize) | |||
| 44856 | : MVT::getIntegerVT(MemSize); | |||
| 44857 | MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize) | |||
| 44858 | : MVT::getIntegerVT(SrcVTSize); | |||
| 44859 | LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements()); | |||
| 44860 | ||||
| 44861 | SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other); | |||
| 44862 | SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() }; | |||
| 44863 | SDValue ResNode = | |||
| 44864 | DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops, | |||
| 44865 | MemVT, BCast->getMemOperand()); | |||
| 44866 | DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1)); | |||
| 44867 | return DAG.getBitcast(VT, ResNode); | |||
| 44868 | } | |||
| 44869 | } | |||
| 44870 | ||||
| 44871 | // Since MMX types are special and don't usually play with other vector types, | |||
| 44872 | // it's better to handle them early to be sure we emit efficient code by | |||
| 44873 | // avoiding store-load conversions. | |||
| 44874 | if (VT == MVT::x86mmx) { | |||
| 44875 | // Detect MMX constant vectors. | |||
| 44876 | APInt UndefElts; | |||
| 44877 | SmallVector<APInt, 1> EltBits; | |||
| 44878 | if (getTargetConstantBitsFromNode(N0, 64, UndefElts, EltBits)) { | |||
| 44879 | SDLoc DL(N0); | |||
| 44880 | // Handle zero-extension of i32 with MOVD. | |||
| 44881 | if (EltBits[0].countl_zero() >= 32) | |||
| 44882 | return DAG.getNode(X86ISD::MMX_MOVW2D, DL, VT, | |||
| 44883 | DAG.getConstant(EltBits[0].trunc(32), DL, MVT::i32)); | |||
| 44884 | // Else, bitcast to a double. | |||
| 44885 | // TODO - investigate supporting sext 32-bit immediates on x86_64. | |||
| 44886 | APFloat F64(APFloat::IEEEdouble(), EltBits[0]); | |||
| 44887 | return DAG.getBitcast(VT, DAG.getConstantFP(F64, DL, MVT::f64)); | |||
| 44888 | } | |||
| 44889 | ||||
| 44890 | // Detect bitcasts to x86mmx low word. | |||
| 44891 | if (N0.getOpcode() == ISD::BUILD_VECTOR && | |||
| 44892 | (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) && | |||
| 44893 | N0.getOperand(0).getValueType() == SrcVT.getScalarType()) { | |||
| 44894 | bool LowUndef = true, AllUndefOrZero = true; | |||
| 44895 | for (unsigned i = 1, e = SrcVT.getVectorNumElements(); i != e; ++i) { | |||
| 44896 | SDValue Op = N0.getOperand(i); | |||
| 44897 | LowUndef &= Op.isUndef() || (i >= e/2); | |||
| 44898 | AllUndefOrZero &= (Op.isUndef() || isNullConstant(Op)); | |||
| 44899 | } | |||
| 44900 | if (AllUndefOrZero) { | |||
| 44901 | SDValue N00 = N0.getOperand(0); | |||
| 44902 | SDLoc dl(N00); | |||
| 44903 | N00 = LowUndef ? DAG.getAnyExtOrTrunc(N00, dl, MVT::i32) | |||
| 44904 | : DAG.getZExtOrTrunc(N00, dl, MVT::i32); | |||
| 44905 | return DAG.getNode(X86ISD::MMX_MOVW2D, dl, VT, N00); | |||
| 44906 | } | |||
| 44907 | } | |||
| 44908 | ||||
| 44909 | // Detect bitcasts of 64-bit build vectors and convert to a | |||
| 44910 | // MMX UNPCK/PSHUFW which takes MMX type inputs with the value in the | |||
| 44911 | // lowest element. | |||
| 44912 | if (N0.getOpcode() == ISD::BUILD_VECTOR && | |||
| 44913 | (SrcVT == MVT::v2f32 || SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || | |||
| 44914 | SrcVT == MVT::v8i8)) | |||
| 44915 | return createMMXBuildVector(cast<BuildVectorSDNode>(N0), DAG, Subtarget); | |||
| 44916 | ||||
| 44917 | // Detect bitcasts between element or subvector extraction to x86mmx. | |||
| 44918 | if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT || | |||
| 44919 | N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) && | |||
| 44920 | isNullConstant(N0.getOperand(1))) { | |||
| 44921 | SDValue N00 = N0.getOperand(0); | |||
| 44922 | if (N00.getValueType().is128BitVector()) | |||
| 44923 | return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT, | |||
| 44924 | DAG.getBitcast(MVT::v2i64, N00)); | |||
| 44925 | } | |||
| 44926 | ||||
| 44927 | // Detect bitcasts from FP_TO_SINT to x86mmx. | |||
| 44928 | if (SrcVT == MVT::v2i32 && N0.getOpcode() == ISD::FP_TO_SINT) { | |||
| 44929 | SDLoc DL(N0); | |||
| 44930 | SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0, | |||
| 44931 | DAG.getUNDEF(MVT::v2i32)); | |||
| 44932 | return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT, | |||
| 44933 | DAG.getBitcast(MVT::v2i64, Res)); | |||
| 44934 | } | |||
| 44935 | } | |||
| 44936 | ||||
| 44937 | // Try to remove a bitcast of constant vXi1 vector. We have to legalize | |||
| 44938 | // most of these to scalar anyway. | |||
| 44939 | if (Subtarget.hasAVX512() && VT.isScalarInteger() && | |||
| 44940 | SrcVT.isVector() && SrcVT.getVectorElementType() == MVT::i1 && | |||
| 44941 | ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) { | |||
| 44942 | return combinevXi1ConstantToInteger(N0, DAG); | |||
| 44943 | } | |||
| 44944 | ||||
| 44945 | if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && | |||
| 44946 | VT.isVector() && VT.getVectorElementType() == MVT::i1 && | |||
| 44947 | isa<ConstantSDNode>(N0)) { | |||
| 44948 | auto *C = cast<ConstantSDNode>(N0); | |||
| 44949 | if (C->isAllOnes()) | |||
| 44950 | return DAG.getConstant(1, SDLoc(N0), VT); | |||
| 44951 | if (C->isZero()) | |||
| 44952 | return DAG.getConstant(0, SDLoc(N0), VT); | |||
| 44953 | } | |||
| 44954 | ||||
| 44955 | // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1. | |||
| 44956 | // Turn it into a sign bit compare that produces a k-register. This avoids | |||
| 44957 | // a trip through a GPR. | |||
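| | // e.g. (v4i1 bitcast (i4 trunc (i32 movmsk (v4f32 x)))) | |||
| | // -> (v4i1 setcc (v4i32 bitcast x), zero, setlt) | |||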
| 44958 | if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() && | |||
| 44959 | VT.isVector() && VT.getVectorElementType() == MVT::i1 && | |||
| 44960 | isPowerOf2_32(VT.getVectorNumElements())) { | |||
| 44961 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 44962 | SDValue Src = N0; | |||
| 44963 | ||||
| 44964 | // Peek through truncate. | |||
| 44965 | if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse()) | |||
| 44966 | Src = N0.getOperand(0); | |||
| 44967 | ||||
| 44968 | if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) { | |||
| 44969 | SDValue MovmskIn = Src.getOperand(0); | |||
| 44970 | MVT MovmskVT = MovmskIn.getSimpleValueType(); | |||
| 44971 | unsigned MovMskElts = MovmskVT.getVectorNumElements(); | |||
| 44972 | ||||
| 44973 | // We allow extra bits of the movmsk to be used since they are known zero. | |||
| 44974 | // We can't convert a VPMOVMSKB without avx512bw. | |||
| 44975 | if (MovMskElts <= NumElts && | |||
| 44976 | (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) { | |||
| 44977 | EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger(); | |||
| 44978 | MovmskIn = DAG.getBitcast(IntVT, MovmskIn); | |||
| 44979 | SDLoc dl(N); | |||
| 44980 | MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts); | |||
| 44981 | SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn, | |||
| 44982 | DAG.getConstant(0, dl, IntVT), ISD::SETLT); | |||
| 44983 | if (EVT(CmpVT) == VT) | |||
| 44984 | return Cmp; | |||
| 44985 | ||||
| 44986 | // Pad with zeroes up to original VT to replace the zeroes that were | |||
| 44987 | // being used from the MOVMSK. | |||
| 44988 | unsigned NumConcats = NumElts / MovMskElts; | |||
| 44989 | SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT)); | |||
| 44990 | Ops[0] = Cmp; | |||
| 44991 | return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops); | |||
| 44992 | } | |||
| 44993 | } | |||
| 44994 | } | |||
| 44995 | ||||
| 44996 | // Try to remove bitcasts from input and output of mask arithmetic to | |||
| 44997 | // remove GPR<->K-register crossings. | |||
| 44998 | if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget)) | |||
| 44999 | return V; | |||
| 45000 | ||||
| 45001 | // Convert a bitcasted integer logic operation that has one bitcasted | |||
| 45002 | // floating-point operand into a floating-point logic operation. This may | |||
| 45003 | // create a load of a constant, but that is cheaper than materializing the | |||
| 45004 | // constant in an integer register and transferring it to an SSE register or | |||
| 45005 | // transferring the SSE operand to integer register and back. | |||
| 45006 | unsigned FPOpcode; | |||
| 45007 | switch (N0.getOpcode()) { | |||
| 45008 | case ISD::AND: FPOpcode = X86ISD::FAND; break; | |||
| 45009 | case ISD::OR: FPOpcode = X86ISD::FOR; break; | |||
| 45010 | case ISD::XOR: FPOpcode = X86ISD::FXOR; break; | |||
| 45011 | default: return SDValue(); | |||
| 45012 | } | |||
| 45013 | ||||
| 45014 | // Check if we have a bitcast from another integer type as well. | |||
| 45015 | if (!((Subtarget.hasSSE1() && VT == MVT::f32) || | |||
| 45016 | (Subtarget.hasSSE2() && VT == MVT::f64) || | |||
| 45017 | (Subtarget.hasFP16() && VT == MVT::f16) || | |||
| 45018 | (Subtarget.hasSSE2() && VT.isInteger() && VT.isVector() && | |||
| 45019 | TLI.isTypeLegal(VT)))) | |||
| 45020 | return SDValue(); | |||
| 45021 | ||||
| 45022 | SDValue LogicOp0 = N0.getOperand(0); | |||
| 45023 | SDValue LogicOp1 = N0.getOperand(1); | |||
| 45024 | SDLoc DL0(N0); | |||
| 45025 | ||||
| 45026 | // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y)) | |||
| 45027 | if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST && | |||
| 45028 | LogicOp0.hasOneUse() && LogicOp0.getOperand(0).hasOneUse() && | |||
| 45029 | LogicOp0.getOperand(0).getValueType() == VT && | |||
| 45030 | !isa<ConstantSDNode>(LogicOp0.getOperand(0))) { | |||
| 45031 | SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1); | |||
| 45032 | unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode(); | |||
| 45033 | return DAG.getNode(Opcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1); | |||
| 45034 | } | |||
| 45035 | // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y) | |||
| 45036 | if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST && | |||
| 45037 | LogicOp1.hasOneUse() && LogicOp1.getOperand(0).hasOneUse() && | |||
| 45038 | LogicOp1.getOperand(0).getValueType() == VT && | |||
| 45039 | !isa<ConstantSDNode>(LogicOp1.getOperand(0))) { | |||
| 45040 | SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0); | |||
| 45041 | unsigned Opcode = VT.isFloatingPoint() ? FPOpcode : N0.getOpcode(); | |||
| 45042 | return DAG.getNode(Opcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0); | |||
| 45043 | } | |||
| 45044 | ||||
| 45045 | return SDValue(); | |||
| 45046 | } | |||
| 45047 | ||||
| 45048 | // (mul (zext a), (sext b)) | |||
| 45049 | static bool detectExtMul(SelectionDAG &DAG, const SDValue &Mul, SDValue &Op0, | |||
| 45050 | SDValue &Op1) { | |||
| 45051 | Op0 = Mul.getOperand(0); | |||
| 45052 | Op1 = Mul.getOperand(1); | |||
| 45053 | ||||
| 45054 | // Canonicalize so that Op1 is the sign-extended operand. | |||
| 45055 | if (Op0.getOpcode() == ISD::SIGN_EXTEND) | |||
| 45056 | std::swap(Op0, Op1); | |||
| 45057 | ||||
| 45058 | auto IsFreeTruncation = [](SDValue &Op) -> bool { | |||
| 45059 | if ((Op.getOpcode() == ISD::ZERO_EXTEND || | |||
| 45060 | Op.getOpcode() == ISD::SIGN_EXTEND) && | |||
| 45061 | Op.getOperand(0).getScalarValueSizeInBits() <= 8) | |||
| 45062 | return true; | |||
| 45063 | ||||
| 45064 | auto *BV = dyn_cast<BuildVectorSDNode>(Op); | |||
| 45065 | return (BV && BV->isConstant()); | |||
| 45066 | }; | |||
| 45067 | ||||
| 45068 | // (dpbusd (zext a), (sext b)). Since the first operand should be an | |||
| 45069 | // unsigned value, we need to check that Op0 is a zero-extended value. Op1 | |||
| 45070 | // should be a signed value, so we just check its significant bits. | |||
| 45071 | if ((IsFreeTruncation(Op0) && | |||
| 45072 | DAG.computeKnownBits(Op0).countMaxActiveBits() <= 8) && | |||
| 45073 | (IsFreeTruncation(Op1) && DAG.ComputeMaxSignificantBits(Op1) <= 8)) | |||
| 45074 | return true; | |||
| 45075 | ||||
| 45076 | return false; | |||
| 45077 | } | |||
| 45078 | ||||
| 45079 | // Given an ABS node, detect the following pattern: | |||
| 45080 | // (ABS (SUB (ZERO_EXTEND a), (ZERO_EXTEND b))). | |||
| 45081 | // This is useful as it is the input into a SAD pattern. | |||
| 45082 | static bool detectZextAbsDiff(const SDValue &Abs, SDValue &Op0, SDValue &Op1) { | |||
| 45083 | SDValue AbsOp1 = Abs->getOperand(0); | |||
| 45084 | if (AbsOp1.getOpcode() != ISD::SUB) | |||
| 45085 | return false; | |||
| 45086 | ||||
| 45087 | Op0 = AbsOp1.getOperand(0); | |||
| 45088 | Op1 = AbsOp1.getOperand(1); | |||
| 45089 | ||||
| 45090 | // Check if the operands of the sub are zero-extended from vectors of i8. | |||
| 45091 | if (Op0.getOpcode() != ISD::ZERO_EXTEND || | |||
| 45092 | Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 || | |||
| 45093 | Op1.getOpcode() != ISD::ZERO_EXTEND || | |||
| 45094 | Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8) | |||
| 45095 | return false; | |||
| 45096 | ||||
| 45097 | return true; | |||
| 45098 | } | |||
| 45099 | ||||
| 45100 | static SDValue createVPDPBUSD(SelectionDAG &DAG, SDValue LHS, SDValue RHS, | |||
| 45101 | unsigned &LogBias, const SDLoc &DL, | |||
| 45102 | const X86Subtarget &Subtarget) { | |||
| 45103 | // Extend or truncate to MVT::i8 first. | |||
| 45104 | MVT Vi8VT = | |||
| 45105 | MVT::getVectorVT(MVT::i8, LHS.getValueType().getVectorElementCount()); | |||
| 45106 | LHS = DAG.getZExtOrTrunc(LHS, DL, Vi8VT); | |||
| 45107 | RHS = DAG.getSExtOrTrunc(RHS, DL, Vi8VT); | |||
| 45108 | ||||
| 45109 | // VPDPBUSD(<16 x i32>C, <16 x i8>A, <16 x i8>B). For each dst element: | |||
| 45110 | // C[0] = C[0] + A[0]*B[0] + A[1]*B[1] + A[2]*B[2] + A[3]*B[3]. | |||
| 45111 | // The src A, B element type is i8, but the dst C element type is i32. | |||
| 45112 | // When we compute the reduction stages, we use the vXi8 src vector type, | |||
| 45113 | // so we need a log-bias of 2 to avoid 2 extra stages. | |||
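| | // For example, reducing 16 extended-i8 products would take log2(16) = 4 | |||
| | // stages on vXi8-sized lanes, but VPDPBUSD already folds each group of 4 | |||
| | // products into an i32 lane, so only 4 - 2 = 2 shuffle+add stages remain. | |||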
| 45114 | LogBias = 2; | |||
| 45115 | ||||
| 45116 | unsigned RegSize = std::max(128u, (unsigned)Vi8VT.getSizeInBits()); | |||
| 45117 | if (Subtarget.hasVNNI() && !Subtarget.hasVLX()) | |||
| 45118 | RegSize = std::max(512u, RegSize); | |||
| 45119 | ||||
| 45120 | // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we | |||
| 45121 | // fill in the missing vector elements with 0. | |||
| 45122 | unsigned NumConcat = RegSize / Vi8VT.getSizeInBits(); | |||
| 45123 | SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, Vi8VT)); | |||
| 45124 | Ops[0] = LHS; | |||
| 45125 | MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8); | |||
| 45126 | SDValue DpOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); | |||
| 45127 | Ops[0] = RHS; | |||
| 45128 | SDValue DpOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); | |||
| 45129 | ||||
| 45130 | // Actually build the DotProduct, split as 256/512 bits for | |||
| 45131 | // AVXVNNI/AVX512VNNI. | |||
| 45132 | auto DpBuilder = [](SelectionDAG &DAG, const SDLoc &DL, | |||
| 45133 | ArrayRef<SDValue> Ops) { | |||
| 45134 | MVT VT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); | |||
| 45135 | return DAG.getNode(X86ISD::VPDPBUSD, DL, VT, Ops); | |||
| 45136 | }; | |||
| 45137 | MVT DpVT = MVT::getVectorVT(MVT::i32, RegSize / 32); | |||
| 45138 | SDValue Zero = DAG.getConstant(0, DL, DpVT); | |||
| 45139 | ||||
| 45140 | return SplitOpsAndApply(DAG, Subtarget, DL, DpVT, {Zero, DpOp0, DpOp1}, | |||
| 45141 | DpBuilder, false); | |||
| 45142 | } | |||
| 45143 | ||||
| 45144 | // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs | |||
| 45145 | // to these zexts. | |||
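| | // PSADBW computes sum(|A[i] - B[i]|) over each group of eight bytes and | |||
| | // leaves the result in the i64 element of that 64-bit lane, which matches | |||
| | // the (ABS (SUB (zext a), (zext b))) feeding the add reduction. | |||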
| 45146 | static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, | |||
| 45147 | const SDValue &Zext1, const SDLoc &DL, | |||
| 45148 | const X86Subtarget &Subtarget) { | |||
| 45149 | // Find the appropriate width for the PSADBW. | |||
| 45150 | EVT InVT = Zext0.getOperand(0).getValueType(); | |||
| 45151 | unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits()); | |||
| 45152 | ||||
| 45153 | // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we | |||
| 45154 | // fill in the missing vector elements with 0. | |||
| 45155 | unsigned NumConcat = RegSize / InVT.getSizeInBits(); | |||
| 45156 | SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT)); | |||
| 45157 | Ops[0] = Zext0.getOperand(0); | |||
| 45158 | MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8); | |||
| 45159 | SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); | |||
| 45160 | Ops[0] = Zext1.getOperand(0); | |||
| 45161 | SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops); | |||
| 45162 | ||||
| 45163 | // Actually build the SAD, split as 128/256/512 bits for SSE/AVX2/AVX512BW. | |||
| 45164 | auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL, | |||
| 45165 | ArrayRef<SDValue> Ops) { | |||
| 45166 | MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64); | |||
| 45167 | return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops); | |||
| 45168 | }; | |||
| 45169 | MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64); | |||
| 45170 | return SplitOpsAndApply(DAG, Subtarget, DL, SadVT, { SadOp0, SadOp1 }, | |||
| 45171 | PSADBWBuilder); | |||
| 45172 | } | |||
| 45173 | ||||
| 45174 | // Attempt to replace a min/max v8i16/v16i8 horizontal reduction with | |||
| 45175 | // PHMINPOSUW. | |||
| 45176 | static SDValue combineMinMaxReduction(SDNode *Extract, SelectionDAG &DAG, | |||
| 45177 | const X86Subtarget &Subtarget) { | |||
| 45178 | // Bail without SSE41. | |||
| 45179 | if (!Subtarget.hasSSE41()) | |||
| 45180 | return SDValue(); | |||
| 45181 | ||||
| 45182 | EVT ExtractVT = Extract->getValueType(0); | |||
| 45183 | if (ExtractVT != MVT::i16 && ExtractVT != MVT::i8) | |||
| 45184 | return SDValue(); | |||
| 45185 | ||||
| 45186 | // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns. | |||
| 45187 | ISD::NodeType BinOp; | |||
| 45188 | SDValue Src = DAG.matchBinOpReduction( | |||
| 45189 | Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}, true); | |||
| 45190 | if (!Src) | |||
| 45191 | return SDValue(); | |||
| 45192 | ||||
| 45193 | EVT SrcVT = Src.getValueType(); | |||
| 45194 | EVT SrcSVT = SrcVT.getScalarType(); | |||
| 45195 | if (SrcSVT != ExtractVT || (SrcVT.getSizeInBits() % 128) != 0) | |||
| 45196 | return SDValue(); | |||
| 45197 | ||||
| 45198 | SDLoc DL(Extract); | |||
| 45199 | SDValue MinPos = Src; | |||
| 45200 | ||||
| 45201 | // First, reduce the source down to 128-bit, applying BinOp to lo/hi. | |||
| 45202 | while (SrcVT.getSizeInBits() > 128) { | |||
| 45203 | SDValue Lo, Hi; | |||
| 45204 | std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL); | |||
| 45205 | SrcVT = Lo.getValueType(); | |||
| 45206 | MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi); | |||
| 45207 | } | |||
| 45208 | assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) || | |||
| 45209 | (SrcVT == MVT::v16i8 && ExtractVT == MVT::i8)) && | |||
| 45210 | "Unexpected value type"); | |||
| 45211 | ||||
| 45212 | // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a | |||
| 45213 | // mask to flip the values accordingly. | |||
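| | // e.g. umax(a,b) == ~umin(~a,~b), and for i16 elements: | |||
| | // smin(a,b) == umin(a ^ 0x8000, b ^ 0x8000) ^ 0x8000, | |||
| | // smax(a,b) == umin(a ^ 0x7fff, b ^ 0x7fff) ^ 0x7fff. | |||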
| 45214 | SDValue Mask; | |||
| 45215 | unsigned MaskEltsBits = ExtractVT.getSizeInBits(); | |||
| 45216 | if (BinOp == ISD::SMAX) | |||
| 45217 | Mask = DAG.getConstant(APInt::getSignedMaxValue(MaskEltsBits), DL, SrcVT); | |||
| 45218 | else if (BinOp == ISD::SMIN) | |||
| 45219 | Mask = DAG.getConstant(APInt::getSignedMinValue(MaskEltsBits), DL, SrcVT); | |||
| 45220 | else if (BinOp == ISD::UMAX) | |||
| 45221 | Mask = DAG.getAllOnesConstant(DL, SrcVT); | |||
| 45222 | ||||
| 45223 | if (Mask) | |||
| 45224 | MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); | |||
| 45225 | ||||
| 45226 | // For v16i8 cases we need to perform UMIN on pairs of byte elements, | |||
| 45227 | // shuffling each upper element down and inserting zeros. This means that | |||
| 45228 | // the v16i8 UMIN will leave the upper element as zero, performing the | |||
| 45229 | // zero-extension ready for the PHMINPOS. | |||
| 45230 | if (ExtractVT == MVT::i8) { | |||
| 45231 | SDValue Upper = DAG.getVectorShuffle( | |||
| 45232 | SrcVT, DL, MinPos, DAG.getConstant(0, DL, MVT::v16i8), | |||
| 45233 | {1, 16, 3, 16, 5, 16, 7, 16, 9, 16, 11, 16, 13, 16, 15, 16}); | |||
| 45234 | MinPos = DAG.getNode(ISD::UMIN, DL, SrcVT, MinPos, Upper); | |||
| 45235 | } | |||
| 45236 | ||||
| 45237 | // Perform the PHMINPOS on a v8i16 vector. | |||
| 45238 | MinPos = DAG.getBitcast(MVT::v8i16, MinPos); | |||
| 45239 | MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, MVT::v8i16, MinPos); | |||
| 45240 | MinPos = DAG.getBitcast(SrcVT, MinPos); | |||
| 45241 | ||||
| 45242 | if (Mask) | |||
| 45243 | MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); | |||
| 45244 | ||||
| 45245 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos, | |||
| 45246 | DAG.getIntPtrConstant(0, DL)); | |||
| 45247 | } | |||
| 45248 | ||||
| 45249 | // Attempt to replace an all_of/any_of/parity style horizontal reduction with a MOVMSK. | |||
| 45250 | static SDValue combinePredicateReduction(SDNode *Extract, SelectionDAG &DAG, | |||
| 45251 | const X86Subtarget &Subtarget) { | |||
| 45252 | // Bail without SSE2. | |||
| 45253 | if (!Subtarget.hasSSE2()) | |||
| 45254 | return SDValue(); | |||
| 45255 | ||||
| 45256 | EVT ExtractVT = Extract->getValueType(0); | |||
| 45257 | unsigned BitWidth = ExtractVT.getSizeInBits(); | |||
| 45258 | if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 && | |||
| 45259 | ExtractVT != MVT::i8 && ExtractVT != MVT::i1) | |||
| 45260 | return SDValue(); | |||
| 45261 | ||||
| 45262 | // Check for OR(any_of)/AND(all_of)/XOR(parity) horizontal reduction patterns. | |||
| 45263 | ISD::NodeType BinOp; | |||
| 45264 | SDValue Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND}); | |||
| 45265 | if (!Match && ExtractVT == MVT::i1) | |||
| 45266 | Match = DAG.matchBinOpReduction(Extract, BinOp, {ISD::XOR}); | |||
| 45267 | if (!Match) | |||
| 45268 | return SDValue(); | |||
| 45269 | ||||
| 45270 | // EXTRACT_VECTOR_ELT can require implicit extension of the vector element | |||
| 45271 | // which we can't support here for now. | |||
| 45272 | if (Match.getScalarValueSizeInBits() != BitWidth) | |||
| 45273 | return SDValue(); | |||
| 45274 | ||||
| 45275 | SDValue Movmsk; | |||
| 45276 | SDLoc DL(Extract); | |||
| 45277 | EVT MatchVT = Match.getValueType(); | |||
| 45278 | unsigned NumElts = MatchVT.getVectorNumElements(); | |||
| 45279 | unsigned MaxElts = Subtarget.hasInt256() ? 32 : 16; | |||
| 45280 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 45281 | LLVMContext &Ctx = *DAG.getContext(); | |||
| 45282 | ||||
| 45283 | if (ExtractVT == MVT::i1) { | |||
| 45284 | // Special case for (pre-legalization) vXi1 reductions. | |||
| 45285 | if (NumElts > 64 || !isPowerOf2_32(NumElts)) | |||
| 45286 | return SDValue(); | |||
| 45287 | if (Match.getOpcode() == ISD::SETCC) { | |||
| 45288 | ISD::CondCode CC = cast<CondCodeSDNode>(Match.getOperand(2))->get(); | |||
| 45289 | if ((BinOp == ISD::AND && CC == ISD::CondCode::SETEQ) || | |||
| 45290 | (BinOp == ISD::OR && CC == ISD::CondCode::SETNE)) { | |||
| 45291 | // For all_of(setcc(x,y,eq)) - use (iX)x == (iX)y. | |||
| 45292 | // For any_of(setcc(x,y,ne)) - use (iX)x != (iX)y. | |||
| 45293 | X86::CondCode X86CC; | |||
| 45294 | SDValue LHS = DAG.getFreeze(Match.getOperand(0)); | |||
| 45295 | SDValue RHS = DAG.getFreeze(Match.getOperand(1)); | |||
| 45296 | APInt Mask = APInt::getAllOnes(LHS.getScalarValueSizeInBits()); | |||
| 45297 | if (SDValue V = LowerVectorAllEqual(DL, LHS, RHS, CC, Mask, Subtarget, | |||
| 45298 | DAG, X86CC)) | |||
| 45299 | return DAG.getNode(ISD::TRUNCATE, DL, ExtractVT, | |||
| 45300 | getSETCC(X86CC, V, DL, DAG)); | |||
| 45301 | } | |||
| 45302 | } | |||
| 45303 | if (TLI.isTypeLegal(MatchVT)) { | |||
| 45304 | // If this is a legal AVX512 predicate type then we can just bitcast. | |||
| 45305 | EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts); | |||
| 45306 | Movmsk = DAG.getBitcast(MovmskVT, Match); | |||
| 45307 | } else { | |||
| 45308 | // Use combineBitcastvxi1 to create the MOVMSK. | |||
| 45309 | while (NumElts > MaxElts) { | |||
| 45310 | SDValue Lo, Hi; | |||
| 45311 | std::tie(Lo, Hi) = DAG.SplitVector(Match, DL); | |||
| 45312 | Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi); | |||
| 45313 | NumElts /= 2; | |||
| 45314 | } | |||
| 45315 | EVT MovmskVT = EVT::getIntegerVT(Ctx, NumElts); | |||
| 45316 | Movmsk = combineBitcastvxi1(DAG, MovmskVT, Match, DL, Subtarget); | |||
| 45317 | } | |||
| 45318 | if (!Movmsk) | |||
| 45319 | return SDValue(); | |||
| 45320 | Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32); | |||
| 45321 | } else { | |||
| 45322 | // FIXME: Better handling of k-registers or 512-bit vectors? | |||
| 45323 | unsigned MatchSizeInBits = Match.getValueSizeInBits(); | |||
| 45324 | if (!(MatchSizeInBits == 128 || | |||
| 45325 | (MatchSizeInBits == 256 && Subtarget.hasAVX()))) | |||
| 45326 | return SDValue(); | |||
| 45327 | ||||
| 45328 | // Make sure this isn't a vector of 1 element. The perf win from using | |||
| 45329 | // MOVMSK diminishes with fewer elements in the reduction, but it is | |||
| 45330 | // generally better to get the comparison over to the GPRs as soon as | |||
| 45331 | // possible to reduce the number of vector ops. | |||
| 45332 | if (Match.getValueType().getVectorNumElements() < 2) | |||
| 45333 | return SDValue(); | |||
| 45334 | ||||
| 45335 | // Check that we are extracting a reduction of all sign bits. | |||
| 45336 | if (DAG.ComputeNumSignBits(Match) != BitWidth) | |||
| 45337 | return SDValue(); | |||
| 45338 | ||||
| 45339 | if (MatchSizeInBits == 256 && BitWidth < 32 && !Subtarget.hasInt256()) { | |||
| 45340 | SDValue Lo, Hi; | |||
| 45341 | std::tie(Lo, Hi) = DAG.SplitVector(Match, DL); | |||
| 45342 | Match = DAG.getNode(BinOp, DL, Lo.getValueType(), Lo, Hi); | |||
| 45343 | MatchSizeInBits = Match.getValueSizeInBits(); | |||
| 45344 | } | |||
| 45345 | ||||
| 45346 | // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB. | |||
| 45347 | MVT MaskSrcVT; | |||
| 45348 | if (64 == BitWidth || 32 == BitWidth) | |||
| 45349 | MaskSrcVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth), | |||
| 45350 | MatchSizeInBits / BitWidth); | |||
| 45351 | else | |||
| 45352 | MaskSrcVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8); | |||
| 45353 | ||||
| 45354 | SDValue BitcastLogicOp = DAG.getBitcast(MaskSrcVT, Match); | |||
| 45355 | Movmsk = getPMOVMSKB(DL, BitcastLogicOp, DAG, Subtarget); | |||
| 45356 | NumElts = MaskSrcVT.getVectorNumElements(); | |||
| 45357 | } | |||
| 45358 | assert((NumElts <= 32 || NumElts == 64) && | |||
| 45359 | "Not expecting more than 64 elements"); | |||
| 45360 | ||||
| 45361 | MVT CmpVT = NumElts == 64 ? MVT::i64 : MVT::i32; | |||
| 45362 | if (BinOp == ISD::XOR) { | |||
| 45363 | // parity -> (PARITY(MOVMSK X)) | |||
| 45364 | SDValue Result = DAG.getNode(ISD::PARITY, DL, CmpVT, Movmsk); | |||
| 45365 | return DAG.getZExtOrTrunc(Result, DL, ExtractVT); | |||
| 45366 | } | |||
| 45367 | ||||
| 45368 | SDValue CmpC; | |||
| 45369 | ISD::CondCode CondCode; | |||
| 45370 | if (BinOp == ISD::OR) { | |||
| 45371 | // any_of -> MOVMSK != 0 | |||
| 45372 | CmpC = DAG.getConstant(0, DL, CmpVT); | |||
| 45373 | CondCode = ISD::CondCode::SETNE; | |||
| 45374 | } else { | |||
| 45375 | // all_of -> MOVMSK == ((1 << NumElts) - 1) | |||
| 45376 | CmpC = DAG.getConstant(APInt::getLowBitsSet(CmpVT.getSizeInBits(), NumElts), | |||
| 45377 | DL, CmpVT); | |||
| 45378 | CondCode = ISD::CondCode::SETEQ; | |||
| 45379 | } | |||
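| | // e.g. for a v16i8 compare NumElts == 16, so all_of tests the MOVMSK | |||
| | // result against 0xFFFF (all 16 lane bits set) and any_of tests it | |||
| | // against 0. | |||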
| 45380 | ||||
| 45381 | // The setcc produces an i8 of 0/1, so extend that to the result width and | |||
| 45382 | // negate to get the final 0/-1 mask value. | |||
| 45383 | EVT SetccVT = TLI.getSetCCResultType(DAG.getDataLayout(), Ctx, CmpVT); | |||
| 45384 | SDValue Setcc = DAG.getSetCC(DL, SetccVT, Movmsk, CmpC, CondCode); | |||
| 45385 | SDValue Zext = DAG.getZExtOrTrunc(Setcc, DL, ExtractVT); | |||
| 45386 | SDValue Zero = DAG.getConstant(0, DL, ExtractVT); | |||
| 45387 | return DAG.getNode(ISD::SUB, DL, ExtractVT, Zero, Zext); | |||
| 45388 | } | |||
| 45389 | ||||
| 45390 | static SDValue combineVPDPBUSDPattern(SDNode *Extract, SelectionDAG &DAG, | |||
| 45391 | const X86Subtarget &Subtarget) { | |||
| 45392 | if (!Subtarget.hasVNNI() && !Subtarget.hasAVXVNNI()) | |||
| 45393 | return SDValue(); | |||
| 45394 | ||||
| 45395 | EVT ExtractVT = Extract->getValueType(0); | |||
| 45396 | // Verify the type we're extracting is i32, as the output element type of | |||
| 45397 | // vpdpbusd is i32. | |||
| 45398 | if (ExtractVT != MVT::i32) | |||
| 45399 | return SDValue(); | |||
| 45400 | ||||
| 45401 | EVT VT = Extract->getOperand(0).getValueType(); | |||
| 45402 | if (!isPowerOf2_32(VT.getVectorNumElements())) | |||
| 45403 | return SDValue(); | |||
| 45404 | ||||
| 45405 | // Match shuffle + add pyramid. | |||
| 45406 | ISD::NodeType BinOp; | |||
| 45407 | SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD}); | |||
| 45408 | ||||
| 45409 | // We can't combine to vpdpbusd for zext, because each of the 4 multiplies | |||
| 45410 | // done by vpdpbusd compute a signed 16-bit product that will be sign extended | |||
| 45411 | // before adding into the accumulator. | |||
| 45412 | // TODO: | |||
| 45413 | // We also need to verify that the multiply has at least 2x the number of bits | |||
| 45414 | // of the input. We shouldn't match | |||
| 45415 | // (sign_extend (mul (vXi9 (zext (vXi8 X))), (vXi9 (zext (vXi8 Y))))). | |||
| 45416 | // if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND)) | |||
| 45417 | // Root = Root.getOperand(0); | |||
| 45418 | ||||
| 45419 | // If there was a match, we want Root to be a mul. | |||
| 45420 | if (!Root || Root.getOpcode() != ISD::MUL) | |||
| 45421 | return SDValue(); | |||
| 45422 | ||||
| 45423 | // Check whether we have an extend and mul pattern | |||
| 45424 | SDValue LHS, RHS; | |||
| 45425 | if (!detectExtMul(DAG, Root, LHS, RHS)) | |||
| 45426 | return SDValue(); | |||
| 45427 | ||||
| 45428 | // Create the dot product instruction. | |||
| 45429 | SDLoc DL(Extract); | |||
| 45430 | unsigned StageBias; | |||
| 45431 | SDValue DP = createVPDPBUSD(DAG, LHS, RHS, StageBias, DL, Subtarget); | |||
| 45432 | ||||
| 45433 | // If the original vector was wider than 4 elements, sum over the results | |||
| 45434 | // in the DP vector. | |||
| 45435 | unsigned Stages = Log2_32(VT.getVectorNumElements()); | |||
| 45436 | EVT DpVT = DP.getValueType(); | |||
| 45437 | ||||
| 45438 | if (Stages > StageBias) { | |||
| 45439 | unsigned DpElems = DpVT.getVectorNumElements(); | |||
| 45440 | ||||
| 45441 | for (unsigned i = Stages - StageBias; i > 0; --i) { | |||
| 45442 | SmallVector<int, 16> Mask(DpElems, -1); | |||
| 45443 | for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j) | |||
| 45444 | Mask[j] = MaskEnd + j; | |||
| 45445 | ||||
| 45446 | SDValue Shuffle = | |||
| 45447 | DAG.getVectorShuffle(DpVT, DL, DP, DAG.getUNDEF(DpVT), Mask); | |||
| 45448 | DP = DAG.getNode(ISD::ADD, DL, DpVT, DP, Shuffle); | |||
| 45449 | } | |||
| 45450 | } | |||
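| | // For example, reducing 16 i8 products (Stages == 4) in the 128-bit case | |||
| | // yields a v4i32 DP of partial sums, so two rounds remain: add the | |||
| | // shuffle <2,3,-1,-1> into lanes 0-1, then <1,-1,-1,-1> into lane 0. | |||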
| 45451 | ||||
| 45452 | // Return the lowest ExtractSizeInBits bits. | |||
| 45453 | EVT ResVT = | |||
| 45454 | EVT::getVectorVT(*DAG.getContext(), ExtractVT, | |||
| 45455 | DpVT.getSizeInBits() / ExtractVT.getSizeInBits()); | |||
| 45456 | DP = DAG.getBitcast(ResVT, DP); | |||
| 45457 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, DP, | |||
| 45458 | Extract->getOperand(1)); | |||
| 45459 | } | |||
| 45460 | ||||
| 45461 | static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, | |||
| 45462 | const X86Subtarget &Subtarget) { | |||
| 45463 | // PSADBW is only supported on SSE2 and up. | |||
| 45464 | if (!Subtarget.hasSSE2()) | |||
| 45465 | return SDValue(); | |||
| 45466 | ||||
| 45467 | EVT ExtractVT = Extract->getValueType(0); | |||
| 45468 | // Verify the type we're extracting is either i32 or i64. | |||
| 45469 | // FIXME: Could support other types, but this is what we have coverage for. | |||
| 45470 | if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64) | |||
| 45471 | return SDValue(); | |||
| 45472 | ||||
| 45473 | EVT VT = Extract->getOperand(0).getValueType(); | |||
| 45474 | if (!isPowerOf2_32(VT.getVectorNumElements())) | |||
| 45475 | return SDValue(); | |||
| 45476 | ||||
| 45477 | // Match shuffle + add pyramid. | |||
| 45478 | ISD::NodeType BinOp; | |||
| 45479 | SDValue Root = DAG.matchBinOpReduction(Extract, BinOp, {ISD::ADD}); | |||
| 45480 | ||||
| 45481 | // The operand is expected to be zero extended from i8 | |||
| 45482 | // (verified in detectZextAbsDiff). | |||
| 45483 | // In order to convert to i64 and above, an additional any/zero/sign | |||
| 45484 | // extend is expected. | |||
| 45485 | // The zero extend from 32 bits has no mathematical effect on the result. | |||
| 45486 | // The sign extend is also effectively a zero extend | |||
| 45487 | // (it extends the sign bit, which is zero). | |||
| 45488 | // So it is correct to skip the sign/zero extend instruction. | |||
| 45489 | if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || | |||
| 45490 | Root.getOpcode() == ISD::ZERO_EXTEND || | |||
| 45491 | Root.getOpcode() == ISD::ANY_EXTEND)) | |||
| 45492 | Root = Root.getOperand(0); | |||
| 45493 | ||||
| 45494 | // If there was a match, we want Root to be an ABS node that is the root | |||
| 45495 | // of an abs-diff pattern. | |||
| 45496 | if (!Root || Root.getOpcode() != ISD::ABS) | |||
| 45497 | return SDValue(); | |||
| 45498 | ||||
| 45499 | // Check whether we have an abs-diff pattern feeding into the ABS. | |||
| 45500 | SDValue Zext0, Zext1; | |||
| 45501 | if (!detectZextAbsDiff(Root, Zext0, Zext1)) | |||
| 45502 | return SDValue(); | |||
| 45503 | ||||
| 45504 | // Create the SAD instruction. | |||
| 45505 | SDLoc DL(Extract); | |||
| 45506 | SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL, Subtarget); | |||
| 45507 | ||||
| 45508 | // If the original vector was wider than 8 elements, sum over the results | |||
| 45509 | // in the SAD vector. | |||
| 45510 | unsigned Stages = Log2_32(VT.getVectorNumElements()); | |||
| 45511 | EVT SadVT = SAD.getValueType(); | |||
| 45512 | if (Stages > 3) { | |||
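| | // Since isPowerOf2_32(NumElts) held above, Stages == Log2_32(NumElts) is | |||
| | // at most 31, and the loop below shifts by i - 1, which stays within | |||
| | // [0, Stages - 4], so the 1 << (i - 1) shift amount is always in range. | |||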
| 45513 | unsigned SadElems = SadVT.getVectorNumElements(); | |||
| 45514 | ||||
| 45515 | for (unsigned i = Stages - 3; i > 0; --i) { | |||
| 45516 | SmallVector<int, 16> Mask(SadElems, -1); | |||
| 45517 | for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j) | |||
| 45518 | Mask[j] = MaskEnd + j; | |||
| 45519 | ||||
| 45520 | SDValue Shuffle = | |||
| 45521 | DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask); | |||
| 45522 | SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle); | |||
| 45523 | } | |||
| 45524 | } | |||
| 45525 | ||||
| 45526 | unsigned ExtractSizeInBits = ExtractVT.getSizeInBits(); | |||
| 45527 | // Return the lowest ExtractSizeInBits bits. | |||
| 45528 | EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT, | |||
| 45529 | SadVT.getSizeInBits() / ExtractSizeInBits); | |||
| 45530 | SAD = DAG.getBitcast(ResVT, SAD); | |||
| 45531 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD, | |||
| 45532 | Extract->getOperand(1)); | |||
| 45533 | } | |||
| 45534 | ||||
| 45535 | // Attempt to peek through a target shuffle and extract the scalar from the | |||
| 45536 | // source. | |||
| 45537 | static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, | |||
| 45538 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 45539 | const X86Subtarget &Subtarget) { | |||
| 45540 | if (DCI.isBeforeLegalizeOps()) | |||
| 45541 | return SDValue(); | |||
| 45542 | ||||
| 45543 | SDLoc dl(N); | |||
| 45544 | SDValue Src = N->getOperand(0); | |||
| 45545 | SDValue Idx = N->getOperand(1); | |||
| 45546 | ||||
| 45547 | EVT VT = N->getValueType(0); | |||
| 45548 | EVT SrcVT = Src.getValueType(); | |||
| 45549 | EVT SrcSVT = SrcVT.getVectorElementType(); | |||
| 45550 | unsigned SrcEltBits = SrcSVT.getSizeInBits(); | |||
| 45551 | unsigned NumSrcElts = SrcVT.getVectorNumElements(); | |||
| 45552 | ||||
| 45553 | // Don't attempt this for boolean mask vectors or unknown extraction indices. | |||
| 45554 | if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx)) | |||
| 45555 | return SDValue(); | |||
| 45556 | ||||
| 45557 | const APInt &IdxC = N->getConstantOperandAPInt(1); | |||
| 45558 | if (IdxC.uge(NumSrcElts)) | |||
| 45559 | return SDValue(); | |||
| 45560 | ||||
| 45561 | SDValue SrcBC = peekThroughBitcasts(Src); | |||
| 45562 | ||||
| 45563 | // Handle extract(bitcast(broadcast(scalar_value))). | |||
| 45564 | if (X86ISD::VBROADCAST == SrcBC.getOpcode()) { | |||
| 45565 | SDValue SrcOp = SrcBC.getOperand(0); | |||
| 45566 | EVT SrcOpVT = SrcOp.getValueType(); | |||
| 45567 | if (SrcOpVT.isScalarInteger() && VT.isInteger() && | |||
| 45568 | (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) { | |||
| 45569 | unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits; | |||
| 45570 | unsigned Offset = IdxC.urem(Scale) * SrcEltBits; | |||
| 45571 | // TODO support non-zero offsets. | |||
| 45572 | if (Offset == 0) { | |||
| 45573 | SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType()); | |||
| 45574 | SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT); | |||
| 45575 | return SrcOp; | |||
| 45576 | } | |||
| 45577 | } | |||
| 45578 | } | |||
| 45579 | ||||
| 45580 | // If we're extracting a single element from a broadcast load and there are | |||
| 45581 | // no other users, just create a single load. | |||
| 45582 | if (SrcBC.getOpcode() == X86ISD::VBROADCAST_LOAD && SrcBC.hasOneUse()) { | |||
| 45583 | auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC); | |||
| 45584 | unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits(); | |||
| 45585 | if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth && | |||
| 45586 | VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) { | |||
| 45587 | SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(), | |||
| 45588 | MemIntr->getBasePtr(), | |||
| 45589 | MemIntr->getPointerInfo(), | |||
| 45590 | MemIntr->getOriginalAlign(), | |||
| 45591 | MemIntr->getMemOperand()->getFlags()); | |||
| 45592 | DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1)); | |||
| 45593 | return Load; | |||
| 45594 | } | |||
| 45595 | } | |||
| 45596 | ||||
| 45597 | // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers. | |||
| 45598 | // TODO: Move to DAGCombine? | |||
| 45599 | if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() && | |||
| 45600 | SrcBC.getValueType().isInteger() && | |||
| 45601 | (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 && | |||
| 45602 | SrcBC.getScalarValueSizeInBits() == | |||
| 45603 | SrcBC.getOperand(0).getValueSizeInBits()) { | |||
| 45604 | unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits; | |||
| 45605 | if (IdxC.ult(Scale)) { | |||
| 45606 | unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits(); | |||
| 45607 | SDValue Scl = SrcBC.getOperand(0); | |||
| 45608 | EVT SclVT = Scl.getValueType(); | |||
| 45609 | if (Offset) { | |||
| 45610 | Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl, | |||
| 45611 | DAG.getShiftAmountConstant(Offset, SclVT, dl)); | |||
| 45612 | } | |||
| 45613 | Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType()); | |||
| 45614 | Scl = DAG.getZExtOrTrunc(Scl, dl, VT); | |||
| 45615 | return Scl; | |||
| 45616 | } | |||
| 45617 | } | |||
| 45618 | ||||
| 45619 | // Handle extract(truncate(x)) for 0'th index. | |||
| 45620 | // TODO: Treat this as a faux shuffle? | |||
| 45621 | // TODO: When can we use this for general indices? | |||
| 45622 | if (ISD::TRUNCATE == Src.getOpcode() && IdxC == 0 && | |||
| 45623 | (SrcVT.getSizeInBits() % 128) == 0) { | |||
| 45624 | Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl); | |||
| 45625 | MVT ExtractVT = MVT::getVectorVT(SrcSVT.getSimpleVT(), 128 / SrcEltBits); | |||
| 45626 | return DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(ExtractVT, Src), | |||
| 45627 | Idx); | |||
| 45628 | } | |||
| 45629 | ||||
| 45630 | // We can only legally extract other elements from 128-bit vectors and in | |||
| 45631 | // certain circumstances, depending on SSE-level. | |||
| 45632 | // TODO: Investigate float/double extraction if it will be just stored. | |||
| 45633 | auto GetLegalExtract = [&Subtarget, &DAG, &dl](SDValue Vec, EVT VecVT, | |||
| 45634 | unsigned Idx) { | |||
| 45635 | EVT VecSVT = VecVT.getScalarType(); | |||
| 45636 | if ((VecVT.is256BitVector() || VecVT.is512BitVector()) && | |||
| 45637 | (VecSVT == MVT::i8 || VecSVT == MVT::i16 || VecSVT == MVT::i32 || | |||
| 45638 | VecSVT == MVT::i64)) { | |||
| 45639 | unsigned EltSizeInBits = VecSVT.getSizeInBits(); | |||
| 45640 | unsigned NumEltsPerLane = 128 / EltSizeInBits; | |||
| 45641 | unsigned LaneOffset = (Idx & ~(NumEltsPerLane - 1)) * EltSizeInBits; | |||
| 45642 | unsigned LaneIdx = LaneOffset / Vec.getScalarValueSizeInBits(); | |||
| 45643 | VecVT = EVT::getVectorVT(*DAG.getContext(), VecSVT, NumEltsPerLane); | |||
| 45644 | Vec = extract128BitVector(Vec, LaneIdx, DAG, dl); | |||
| 45645 | Idx &= (NumEltsPerLane - 1); | |||
| 45646 | } | |||
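| | // e.g. extracting element 21 of a v32i8: LaneOffset == 128 bits, so take | |||
| | // 128-bit lane 1 and extract element 21 & 15 == 5 from it. | |||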
| 45647 | if ((VecVT == MVT::v4i32 || VecVT == MVT::v2i64) && | |||
| 45648 | ((Idx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) { | |||
| 45649 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VecVT.getScalarType(), | |||
| 45650 | DAG.getBitcast(VecVT, Vec), | |||
| 45651 | DAG.getIntPtrConstant(Idx, dl)); | |||
| 45652 | } | |||
| 45653 | if ((VecVT == MVT::v8i16 && Subtarget.hasSSE2()) || | |||
| 45654 | (VecVT == MVT::v16i8 && Subtarget.hasSSE41())) { | |||
| 45655 | unsigned OpCode = (VecVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB); | |||
| 45656 | return DAG.getNode(OpCode, dl, MVT::i32, DAG.getBitcast(VecVT, Vec), | |||
| 45657 | DAG.getTargetConstant(Idx, dl, MVT::i8)); | |||
| 45658 | } | |||
| 45659 | return SDValue(); | |||
| 45660 | }; | |||
| 45661 | ||||
| 45662 | // Resolve the target shuffle inputs and mask. | |||
| 45663 | SmallVector<int, 16> Mask; | |||
| 45664 | SmallVector<SDValue, 2> Ops; | |||
| 45665 | if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG)) | |||
| 45666 | return SDValue(); | |||
| 45667 | ||||
| 45668 | // Shuffle inputs must be the same size as the result. | |||
| 45669 | if (llvm::any_of(Ops, [SrcVT](SDValue Op) { | |||
| 45670 | return SrcVT.getSizeInBits() != Op.getValueSizeInBits(); | |||
| 45671 | })) | |||
| 45672 | return SDValue(); | |||
| 45673 | ||||
| 45674 | // Attempt to narrow/widen the shuffle mask to the correct size. | |||
| 45675 | if (Mask.size() != NumSrcElts) { | |||
| 45676 | if ((NumSrcElts % Mask.size()) == 0) { | |||
| 45677 | SmallVector<int, 16> ScaledMask; | |||
| 45678 | int Scale = NumSrcElts / Mask.size(); | |||
| 45679 | narrowShuffleMaskElts(Scale, Mask, ScaledMask); | |||
| 45680 | Mask = std::move(ScaledMask); | |||
| 45681 | } else if ((Mask.size() % NumSrcElts) == 0) { | |||
| 45682 | // Simplify Mask based on demanded element. | |||
| 45683 | int ExtractIdx = (int)IdxC.getZExtValue(); | |||
| 45684 | int Scale = Mask.size() / NumSrcElts; | |||
| 45685 | int Lo = Scale * ExtractIdx; | |||
| 45686 | int Hi = Scale * (ExtractIdx + 1); | |||
| 45687 | for (int i = 0, e = (int)Mask.size(); i != e; ++i) | |||
| 45688 | if (i < Lo || Hi <= i) | |||
| 45689 | Mask[i] = SM_SentinelUndef; | |||
| 45690 | ||||
| 45691 | SmallVector<int, 16> WidenedMask; | |||
| 45692 | while (Mask.size() > NumSrcElts && | |||
| 45693 | canWidenShuffleElements(Mask, WidenedMask)) | |||
| 45694 | Mask = std::move(WidenedMask); | |||
| 45695 | } | |||
| 45696 | } | |||
| 45697 | ||||
| 45698 | // If narrowing/widening failed, see if we can extract+zero-extend. | |||
| 45699 | int ExtractIdx; | |||
| 45700 | EVT ExtractVT; | |||
| 45701 | if (Mask.size() == NumSrcElts) { | |||
| 45702 | ExtractIdx = Mask[IdxC.getZExtValue()]; | |||
| 45703 | ExtractVT = SrcVT; | |||
| 45704 | } else { | |||
| 45705 | unsigned Scale = Mask.size() / NumSrcElts; | |||
| 45706 | if ((Mask.size() % NumSrcElts) != 0 || SrcVT.isFloatingPoint()) | |||
| 45707 | return SDValue(); | |||
| 45708 | unsigned ScaledIdx = Scale * IdxC.getZExtValue(); | |||
| 45709 | if (!isUndefOrZeroInRange(Mask, ScaledIdx + 1, Scale - 1)) | |||
| 45710 | return SDValue(); | |||
| 45711 | ExtractIdx = Mask[ScaledIdx]; | |||
| 45712 | EVT ExtractSVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltBits / Scale); | |||
| 45713 | ExtractVT = EVT::getVectorVT(*DAG.getContext(), ExtractSVT, Mask.size()); | |||
| 45714 | assert(SrcVT.getSizeInBits() == ExtractVT.getSizeInBits() && | |||
| 45715 | "Failed to widen vector type"); | |||
| 45716 | } | |||
| 45717 | ||||
| 45718 | // If the shuffle source element is undef/zero then we can just accept it. | |||
| 45719 | if (ExtractIdx == SM_SentinelUndef) | |||
| 45720 | return DAG.getUNDEF(VT); | |||
| 45721 | ||||
| 45722 | if (ExtractIdx == SM_SentinelZero) | |||
| 45723 | return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT) | |||
| 45724 | : DAG.getConstant(0, dl, VT); | |||
| 45725 | ||||
| 45726 | SDValue SrcOp = Ops[ExtractIdx / Mask.size()]; | |||
| 45727 | ExtractIdx = ExtractIdx % Mask.size(); | |||
| 45728 | if (SDValue V = GetLegalExtract(SrcOp, ExtractVT, ExtractIdx)) | |||
| 45729 | return DAG.getZExtOrTrunc(V, dl, VT); | |||
| 45730 | ||||
| 45731 | return SDValue(); | |||
| 45732 | } | |||
| 45733 | ||||
| 45734 | /// Extracting a scalar FP value from vector element 0 is free, so extract each | |||
| 45735 | /// operand first, then perform the math as a scalar op. | |||
| 45736 | static SDValue scalarizeExtEltFP(SDNode *ExtElt, SelectionDAG &DAG, | |||
| 45737 | const X86Subtarget &Subtarget) { | |||
| 45738 | assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Expected extract"); | |||
| 45739 | SDValue Vec = ExtElt->getOperand(0); | |||
| 45740 | SDValue Index = ExtElt->getOperand(1); | |||
| 45741 | EVT VT = ExtElt->getValueType(0); | |||
| 45742 | EVT VecVT = Vec.getValueType(); | |||
| 45743 | ||||
| 45744 | // TODO: If this is a unary/expensive/expand op, allow extraction from a | |||
| 45745 | // non-zero element because the shuffle+scalar op will be cheaper? | |||
| 45746 | if (!Vec.hasOneUse() || !isNullConstant(Index) || VecVT.getScalarType() != VT) | |||
| 45747 | return SDValue(); | |||
| 45748 | ||||
| 45749 | // Vector FP compares don't fit the pattern of FP math ops (propagate, not | |||
| 45750 | // extract, the condition code), so deal with those as a special case. | |||
| 45751 | if (Vec.getOpcode() == ISD::SETCC && VT == MVT::i1) { | |||
| 45752 | EVT OpVT = Vec.getOperand(0).getValueType().getScalarType(); | |||
| 45753 | if (OpVT != MVT::f32 && OpVT != MVT::f64) | |||
| 45754 | return SDValue(); | |||
| 45755 | ||||
| 45756 | // extract (setcc X, Y, CC), 0 --> setcc (extract X, 0), (extract Y, 0), CC | |||
| 45757 | SDLoc DL(ExtElt); | |||
| 45758 | SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, | |||
| 45759 | Vec.getOperand(0), Index); | |||
| 45760 | SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, | |||
| 45761 | Vec.getOperand(1), Index); | |||
| 45762 | return DAG.getNode(Vec.getOpcode(), DL, VT, Ext0, Ext1, Vec.getOperand(2)); | |||
| 45763 | } | |||
| 45764 | ||||
| 45765 | if (!(VT == MVT::f16 && Subtarget.hasFP16()) && VT != MVT::f32 && | |||
| 45766 | VT != MVT::f64) | |||
| 45767 | return SDValue(); | |||
| 45768 | ||||
| 45769 | // Vector FP selects don't fit the pattern of FP math ops (because the | |||
| 45770 | // condition has a different type and we have to change the opcode), so deal | |||
| 45771 | // with those here. | |||
| 45772 | // FIXME: This is restricted to pre type legalization by ensuring the setcc | |||
| 45773 | // has i1 elements. If we loosen this we need to convert vector bool to a | |||
| 45774 | // scalar bool. | |||
| 45775 | if (Vec.getOpcode() == ISD::VSELECT && | |||
| 45776 | Vec.getOperand(0).getOpcode() == ISD::SETCC && | |||
| 45777 | Vec.getOperand(0).getValueType().getScalarType() == MVT::i1 && | |||
| 45778 | Vec.getOperand(0).getOperand(0).getValueType() == VecVT) { | |||
| 45779 | // ext (sel Cond, X, Y), 0 --> sel (ext Cond, 0), (ext X, 0), (ext Y, 0) | |||
| 45780 | SDLoc DL(ExtElt); | |||
| 45781 | SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, | |||
| 45782 | Vec.getOperand(0).getValueType().getScalarType(), | |||
| 45783 | Vec.getOperand(0), Index); | |||
| 45784 | SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, | |||
| 45785 | Vec.getOperand(1), Index); | |||
| 45786 | SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, | |||
| 45787 | Vec.getOperand(2), Index); | |||
| 45788 | return DAG.getNode(ISD::SELECT, DL, VT, Ext0, Ext1, Ext2); | |||
| 45789 | } | |||
| 45790 | ||||
| 45791 | // TODO: This switch could include FNEG and the x86-specific FP logic ops | |||
| 45792 | // (FAND, FANDN, FOR, FXOR). But that may require enhancements to avoid | |||
| 45793 | // missed load folding and fma+fneg combining. | |||
| 45794 | switch (Vec.getOpcode()) { | |||
| 45795 | case ISD::FMA: // Begin 3 operands | |||
| 45796 | case ISD::FMAD: | |||
| 45797 | case ISD::FADD: // Begin 2 operands | |||
| 45798 | case ISD::FSUB: | |||
| 45799 | case ISD::FMUL: | |||
| 45800 | case ISD::FDIV: | |||
| 45801 | case ISD::FREM: | |||
| 45802 | case ISD::FCOPYSIGN: | |||
| 45803 | case ISD::FMINNUM: | |||
| 45804 | case ISD::FMAXNUM: | |||
| 45805 | case ISD::FMINNUM_IEEE: | |||
| 45806 | case ISD::FMAXNUM_IEEE: | |||
| 45807 | case ISD::FMAXIMUM: | |||
| 45808 | case ISD::FMINIMUM: | |||
| 45809 | case X86ISD::FMAX: | |||
| 45810 | case X86ISD::FMIN: | |||
| 45811 | case ISD::FABS: // Begin 1 operand | |||
| 45812 | case ISD::FSQRT: | |||
| 45813 | case ISD::FRINT: | |||
| 45814 | case ISD::FCEIL: | |||
| 45815 | case ISD::FTRUNC: | |||
| 45816 | case ISD::FNEARBYINT: | |||
| 45817 | case ISD::FROUND: | |||
| 45818 | case ISD::FFLOOR: | |||
| 45819 | case X86ISD::FRCP: | |||
| 45820 | case X86ISD::FRSQRT: { | |||
| 45821 | // extract (fp X, Y, ...), 0 --> fp (extract X, 0), (extract Y, 0), ... | |||
| 45822 | SDLoc DL(ExtElt); | |||
| 45823 | SmallVector<SDValue, 4> ExtOps; | |||
| 45824 | for (SDValue Op : Vec->ops()) | |||
| 45825 | ExtOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op, Index)); | |||
| 45826 | return DAG.getNode(Vec.getOpcode(), DL, VT, ExtOps); | |||
| 45827 | } | |||
| 45828 | default: | |||
| 45829 | return SDValue(); | |||
| 45830 | } | |||
| 45831 | llvm_unreachable("All opcodes should return within switch"); | |||
| 45832 | } | |||
| 45833 | ||||
| 45834 | /// Try to convert a vector reduction sequence composed of binops and shuffles | |||
| 45835 | /// into horizontal ops. | |||
| 45836 | static SDValue combineArithReduction(SDNode *ExtElt, SelectionDAG &DAG, | |||
| 45837 | const X86Subtarget &Subtarget) { | |||
| 45838 | assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller"); | |||
| 45839 | ||||
| 45840 | // We need at least SSE2 to do anything here. | |||
| 45841 | if (!Subtarget.hasSSE2()) | |||
| 45842 | return SDValue(); | |||
| 45843 | ||||
| 45844 | ISD::NodeType Opc; | |||
| 45845 | SDValue Rdx = DAG.matchBinOpReduction(ExtElt, Opc, | |||
| 45846 | {ISD::ADD, ISD::MUL, ISD::FADD}, true); | |||
| 45847 | if (!Rdx) | |||
| 45848 | return SDValue(); | |||
| 45849 | ||||
| 45850 | SDValue Index = ExtElt->getOperand(1); | |||
| 45851 | assert(isNullConstant(Index) && | |||
| 45852 | "Reduction doesn't end in an extract from index 0"); | |||
| 45853 | ||||
| 45854 | EVT VT = ExtElt->getValueType(0); | |||
| 45855 | EVT VecVT = Rdx.getValueType(); | |||
| 45856 | if (VecVT.getScalarType() != VT) | |||
| 45857 | return SDValue(); | |||
| 45858 | ||||
| 45859 | SDLoc DL(ExtElt); | |||
| 45860 | unsigned NumElts = VecVT.getVectorNumElements(); | |||
| 45861 | unsigned EltSizeInBits = VecVT.getScalarSizeInBits(); | |||
| 45862 | ||||
| 45863 | // Extend v4i8/v8i8 vector to v16i8, with undef upper 64-bits. | |||
| 45864 | auto WidenToV16I8 = [&](SDValue V, bool ZeroExtend) { | |||
| 45865 | if (V.getValueType() == MVT::v4i8) { | |||
| 45866 | if (ZeroExtend && Subtarget.hasSSE41()) { | |||
| 45867 | V = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v4i32, | |||
| 45868 | DAG.getConstant(0, DL, MVT::v4i32), | |||
| 45869 | DAG.getBitcast(MVT::i32, V), | |||
| 45870 | DAG.getIntPtrConstant(0, DL)); | |||
| 45871 | return DAG.getBitcast(MVT::v16i8, V); | |||
| 45872 | } | |||
| 45873 | V = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i8, V, | |||
| 45874 | ZeroExtend ? DAG.getConstant(0, DL, MVT::v4i8) | |||
| 45875 | : DAG.getUNDEF(MVT::v4i8)); | |||
| 45876 | } | |||
| 45877 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V, | |||
| 45878 | DAG.getUNDEF(MVT::v8i8)); | |||
| 45879 | }; | |||
| 45880 | ||||
| 45881 | // vXi8 mul reduction - promote to vXi16 mul reduction. | |||
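| | // Unpacking the bytes against undef and bitcasting to vXi16 places each | |||
| | // byte in the low half of an i16 lane; multiplying as i16 then leaves the | |||
| | // correct i8 product in that low byte, since only the low 8 bits of each | |||
| | // operand can affect it. | |||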
| 45882 | if (Opc == ISD::MUL) { | |||
| 45883 | if (VT != MVT::i8 || NumElts < 4 || !isPowerOf2_32(NumElts)) | |||
| 45884 | return SDValue(); | |||
| 45885 | if (VecVT.getSizeInBits() >= 128) { | |||
| 45886 | EVT WideVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts / 2); | |||
| 45887 | SDValue Lo = getUnpackl(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT)); | |||
| 45888 | SDValue Hi = getUnpackh(DAG, DL, VecVT, Rdx, DAG.getUNDEF(VecVT)); | |||
| 45889 | Lo = DAG.getBitcast(WideVT, Lo); | |||
| 45890 | Hi = DAG.getBitcast(WideVT, Hi); | |||
| 45891 | Rdx = DAG.getNode(Opc, DL, WideVT, Lo, Hi); | |||
| 45892 | while (Rdx.getValueSizeInBits() > 128) { | |||
| 45893 | std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL); | |||
| 45894 | Rdx = DAG.getNode(Opc, DL, Lo.getValueType(), Lo, Hi); | |||
| 45895 | } | |||
| 45896 | } else { | |||
| 45897 | Rdx = WidenToV16I8(Rdx, false); | |||
| 45898 | Rdx = getUnpackl(DAG, DL, MVT::v16i8, Rdx, DAG.getUNDEF(MVT::v16i8)); | |||
| 45899 | Rdx = DAG.getBitcast(MVT::v8i16, Rdx); | |||
| 45900 | } | |||
| 45901 | if (NumElts >= 8) | |||
| 45902 | Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx, | |||
| 45903 | DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx, | |||
| 45904 | {4, 5, 6, 7, -1, -1, -1, -1})); | |||
| 45905 | Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx, | |||
| 45906 | DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx, | |||
| 45907 | {2, 3, -1, -1, -1, -1, -1, -1})); | |||
| 45908 | Rdx = DAG.getNode(Opc, DL, MVT::v8i16, Rdx, | |||
| 45909 | DAG.getVectorShuffle(MVT::v8i16, DL, Rdx, Rdx, | |||
| 45910 | {1, -1, -1, -1, -1, -1, -1, -1})); | |||
| 45911 | Rdx = DAG.getBitcast(MVT::v16i8, Rdx); | |||
| 45912 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index); | |||
| 45913 | } | |||
| 45914 | ||||
| 45915 | // vXi8 add reduction - sub 128-bit vector. | |||
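| | // PSADBW against an all-zero vector computes sum(|x[i] - 0|) == sum(x[i]), | |||
| | // so the low i64 lane holds the whole byte sum directly; e.g. summing the | |||
| | // bytes {1,2,3,4,5,6,7,8} zero-padded to 16 bytes leaves 36 in lane 0. | |||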
| 45916 | if (VecVT == MVT::v4i8 || VecVT == MVT::v8i8) { | |||
| 45917 | Rdx = WidenToV16I8(Rdx, true); | |||
| 45918 | Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx, | |||
| 45919 | DAG.getConstant(0, DL, MVT::v16i8)); | |||
| 45920 | Rdx = DAG.getBitcast(MVT::v16i8, Rdx); | |||
| 45921 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index); | |||
| 45922 | } | |||
| 45923 | ||||
| 45924 | // Must be a >=128-bit vector with pow2 elements. | |||
| 45925 | if ((VecVT.getSizeInBits() % 128) != 0 || !isPowerOf2_32(NumElts)) | |||
| 45926 | return SDValue(); | |||
| 45927 | ||||
| 45928 | // vXi8 add reduction - sum lo/hi halves then use PSADBW. | |||
| 45929 | if (VT == MVT::i8) { | |||
| 45930 | while (Rdx.getValueSizeInBits() > 128) { | |||
| 45931 | SDValue Lo, Hi; | |||
| 45932 | std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL); | |||
| 45933 | VecVT = Lo.getValueType(); | |||
| 45934 | Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi); | |||
| 45935 | } | |||
| 45936 | assert(VecVT == MVT::v16i8 && "v16i8 reduction expected"); | |||
| 45937 | ||||
| 45938 | SDValue Hi = DAG.getVectorShuffle( | |||
| 45939 | MVT::v16i8, DL, Rdx, Rdx, | |||
| 45940 | {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1}); | |||
| 45941 | Rdx = DAG.getNode(ISD::ADD, DL, MVT::v16i8, Rdx, Hi); | |||
| 45942 | Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx, | |||
| 45943 | getZeroVector(MVT::v16i8, Subtarget, DAG, DL)); | |||
| 45944 | Rdx = DAG.getBitcast(MVT::v16i8, Rdx); | |||
| 45945 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index); | |||
| 45946 | } | |||
| 45947 | ||||
| 45948 | // See if we can use vXi8 PSADBW add reduction for larger zext types. | |||
| 45949 | // If the source vector values are 0-255, then we can use PSADBW to | |||
| 45950 | // sum+zext v8i8 subvectors to vXi64, then perform the reduction. | |||
| 45951 | // TODO: See if it's worth avoiding vXi16/i32 truncations? | |||
| 45952 | if (Opc == ISD::ADD && NumElts >= 4 && EltSizeInBits >= 16 && | |||
| 45953 | DAG.computeKnownBits(Rdx).getMaxValue().ule(255) && | |||
| 45954 | (EltSizeInBits == 16 || Rdx.getOpcode() == ISD::ZERO_EXTEND || | |||
| 45955 | Subtarget.hasAVX512())) { | |||
| 45956 | EVT ByteVT = VecVT.changeVectorElementType(MVT::i8); | |||
| 45957 | Rdx = DAG.getNode(ISD::TRUNCATE, DL, ByteVT, Rdx); | |||
| 45958 | if (ByteVT.getSizeInBits() < 128) | |||
| 45959 | Rdx = WidenToV16I8(Rdx, true); | |||
| 45960 | ||||
| 45961 | // Build the PSADBW, split as 128/256/512 bits for SSE/AVX2/AVX512BW. | |||
| 45962 | auto PSADBWBuilder = [](SelectionDAG &DAG, const SDLoc &DL, | |||
| 45963 | ArrayRef<SDValue> Ops) { | |||
| 45964 | MVT VT = MVT::getVectorVT(MVT::i64, Ops[0].getValueSizeInBits() / 64); | |||
| 45965 | SDValue Zero = DAG.getConstant(0, DL, Ops[0].getValueType()); | |||
| 45966 | return DAG.getNode(X86ISD::PSADBW, DL, VT, Ops[0], Zero); | |||
| 45967 | }; | |||
| 45968 | MVT SadVT = MVT::getVectorVT(MVT::i64, Rdx.getValueSizeInBits() / 64); | |||
| 45969 | Rdx = SplitOpsAndApply(DAG, Subtarget, DL, SadVT, {Rdx}, PSADBWBuilder); | |||
| 45970 | ||||
| 45971 | // TODO: We could truncate to vXi16/vXi32 before performing the reduction. | |||
| 45972 | while (Rdx.getValueSizeInBits() > 128) { | |||
| 45973 | SDValue Lo, Hi; | |||
| 45974 | std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL); | |||
| 45975 | VecVT = Lo.getValueType(); | |||
| 45976 | Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi); | |||
| 45977 | } | |||
| 45978 | assert(Rdx.getValueType() == MVT::v2i64 && "v2i64 reduction expected"); | |||
| 45979 | ||||
| 45980 | if (NumElts > 8) { | |||
| 45981 | SDValue RdxHi = DAG.getVectorShuffle(MVT::v2i64, DL, Rdx, Rdx, {1, -1}); | |||
| 45982 | Rdx = DAG.getNode(ISD::ADD, DL, MVT::v2i64, Rdx, RdxHi); | |||
| 45983 | } | |||
| 45984 | ||||
| 45985 | VecVT = MVT::getVectorVT(VT.getSimpleVT(), 128 / VT.getSizeInBits()); | |||
| 45986 | Rdx = DAG.getBitcast(VecVT, Rdx); | |||
| 45987 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index); | |||
| 45988 | } | |||
| 45989 | ||||
| 45990 | // Only use (F)HADD opcodes if they aren't microcoded or if we're minimizing codesize. | |||
| 45991 | if (!shouldUseHorizontalOp(true, DAG, Subtarget)) | |||
| 45992 | return SDValue(); | |||
| 45993 | ||||
| 45994 | unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD; | |||
| 45995 | ||||
| 45996 | // 256-bit horizontal instructions operate on 128-bit chunks rather than | |||
| 45997 | // across the whole vector, so we need an extract + hop preliminary stage. | |||
| 45998 | // This is the only step where the operands of the hop are not the same value. | |||
| 45999 | // TODO: We could extend this to handle 512-bit or even longer vectors. | |||
| 46000 | if (((VecVT == MVT::v16i16 || VecVT == MVT::v8i32) && Subtarget.hasSSSE3()) || | |||
| 46001 | ((VecVT == MVT::v8f32 || VecVT == MVT::v4f64) && Subtarget.hasSSE3())) { | |||
| 46002 | unsigned NumElts = VecVT.getVectorNumElements(); | |||
| 46003 | SDValue Hi = extract128BitVector(Rdx, NumElts / 2, DAG, DL); | |||
| 46004 | SDValue Lo = extract128BitVector(Rdx, 0, DAG, DL); | |||
| 46005 | Rdx = DAG.getNode(HorizOpcode, DL, Lo.getValueType(), Hi, Lo); | |||
| 46006 | VecVT = Rdx.getValueType(); | |||
| 46007 | } | |||
| 46008 | if (!((VecVT == MVT::v8i16 || VecVT == MVT::v4i32) && Subtarget.hasSSSE3()) && | |||
| 46009 | !((VecVT == MVT::v4f32 || VecVT == MVT::v2f64) && Subtarget.hasSSE3())) | |||
| 46010 | return SDValue(); | |||
| 46011 | ||||
| 46012 | // extract (add (shuf X), X), 0 --> extract (hadd X, X), 0 | |||
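| | // e.g. for v4i32, hadd(X,X) == {X[0]+X[1], X[2]+X[3], X[0]+X[1], | |||
| | // X[2]+X[3]}, and a second hadd then leaves the total in every lane, | |||
| | // including lane 0 for the final extract. | |||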
| 46013 | unsigned ReductionSteps = Log2_32(VecVT.getVectorNumElements()); | |||
| 46014 | for (unsigned i = 0; i != ReductionSteps; ++i) | |||
| 46015 | Rdx = DAG.getNode(HorizOpcode, DL, VecVT, Rdx, Rdx); | |||
| 46016 | ||||
| 46017 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index); | |||
| 46018 | } | |||
| 46019 | ||||
| 46020 | /// Detect vector gather/scatter index generation and convert it from being a | |||
| 46021 | /// bunch of shuffles and extracts into a somewhat faster sequence. | |||
| 46022 | /// For i686, the best sequence is apparently storing the value and loading | |||
| 46023 | /// scalars back, while for x64 we should use 64-bit extracts and shifts. | |||
| 46024 | static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, | |||
| 46025 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 46026 | const X86Subtarget &Subtarget) { | |||
| 46027 | if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget)) | |||
| 46028 | return NewOp; | |||
| 46029 | ||||
| 46030 | SDValue InputVector = N->getOperand(0); | |||
| 46031 | SDValue EltIdx = N->getOperand(1); | |||
| 46032 | auto *CIdx = dyn_cast<ConstantSDNode>(EltIdx); | |||
| 46033 | ||||
| 46034 | EVT SrcVT = InputVector.getValueType(); | |||
| 46035 | EVT VT = N->getValueType(0); | |||
| 46036 | SDLoc dl(InputVector); | |||
| 46037 | bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT; | |||
| 46038 | unsigned NumSrcElts = SrcVT.getVectorNumElements(); | |||
| 46039 | unsigned NumEltBits = VT.getScalarSizeInBits(); | |||
| 46040 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 46041 | ||||
| 46042 | if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts)) | |||
| 46043 | return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT); | |||
| 46044 | ||||
| 46045 | // Integer Constant Folding. | |||
| 46046 | if (CIdx && VT.isInteger()) { | |||
| 46047 | APInt UndefVecElts; | |||
| 46048 | SmallVector<APInt, 16> EltBits; | |||
| 46049 | unsigned VecEltBitWidth = SrcVT.getScalarSizeInBits(); | |||
| 46050 | if (getTargetConstantBitsFromNode(InputVector, VecEltBitWidth, UndefVecElts, | |||
| 46051 | EltBits, true, false)) { | |||
| 46052 | uint64_t Idx = CIdx->getZExtValue(); | |||
| 46053 | if (UndefVecElts[Idx]) | |||
| 46054 | return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT); | |||
| 46055 | return DAG.getConstant(EltBits[Idx].zext(NumEltBits), dl, VT); | |||
| 46056 | } | |||
| 46057 | ||||
| 46058 | // Convert extract_element(bitcast(<X x i1>)) -> bitcast(extract_subvector()). | |||
| 46059 | // Improves lowering of bool masks on Rust, which splits them into a byte array. | |||
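| | // e.g. extracting i8 element 1 of a bitcast of v16i1 %m becomes | |||
| | // bitcast(extract_subvector(%m, 8)), i.e. mask bits 8-15 as an i8. | |||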
| 46060 | if (InputVector.getOpcode() == ISD::BITCAST && (NumEltBits % 8) == 0) { | |||
| 46061 | SDValue Src = peekThroughBitcasts(InputVector); | |||
| 46062 | if (Src.getValueType().getScalarType() == MVT::i1 && | |||
| 46063 | TLI.isTypeLegal(Src.getValueType())) { | |||
| 46064 | MVT SubVT = MVT::getVectorVT(MVT::i1, NumEltBits); | |||
| 46065 | SDValue Sub = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Src, | |||
| 46066 | DAG.getIntPtrConstant(CIdx->getZExtValue() * NumEltBits, dl)); | |||
| 46067 | return DAG.getBitcast(VT, Sub); | |||
| 46068 | } | |||
| 46069 | } | |||
| 46070 | } | |||
| 46071 | ||||
| 46072 | if (IsPextr) { | |||
| 46073 | if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumEltBits), | |||
| 46074 | DCI)) | |||
| 46075 | return SDValue(N, 0); | |||
| 46076 | ||||
| 46077 | // PEXTR*(PINSR*(v, s, c), c) -> s (with implicit zext handling). | |||
| 46078 | if ((InputVector.getOpcode() == X86ISD::PINSRB || | |||
| 46079 | InputVector.getOpcode() == X86ISD::PINSRW) && | |||
| 46080 | InputVector.getOperand(2) == EltIdx) { | |||
| 46081 | assert(SrcVT == InputVector.getOperand(0).getValueType() && | |||
| 46082 | "Vector type mismatch"); | |||
| 46083 | SDValue Scl = InputVector.getOperand(1); | |||
| 46084 | Scl = DAG.getNode(ISD::TRUNCATE, dl, SrcVT.getScalarType(), Scl); | |||
| 46085 | return DAG.getZExtOrTrunc(Scl, dl, VT); | |||
| 46086 | } | |||
| 46087 | ||||
| 46088 | // TODO - Remove this once we can handle the implicit zero-extension of | |||
| 46089 | // X86ISD::PEXTRW/X86ISD::PEXTRB in combinePredicateReduction and | |||
| 46090 | // combineBasicSADPattern. | |||
| 46091 | return SDValue(); | |||
| 46092 | } | |||
| 46093 | ||||
| 46094 | // Detect mmx extraction of all bits as an i64. It works better as a bitcast. | |||
| 46095 | if (VT == MVT::i64 && SrcVT == MVT::v1i64 && | |||
| 46096 | InputVector.getOpcode() == ISD::BITCAST && | |||
| 46097 | InputVector.getOperand(0).getValueType() == MVT::x86mmx && | |||
| 46098 | isNullConstant(EltIdx) && InputVector.hasOneUse()) | |||
| 46099 | return DAG.getBitcast(VT, InputVector); | |||
| 46100 | ||||
| 46101 | // Detect mmx to i32 conversion through a v2i32 elt extract. | |||
| 46102 | if (VT == MVT::i32 && SrcVT == MVT::v2i32 && | |||
| 46103 | InputVector.getOpcode() == ISD::BITCAST && | |||
| 46104 | InputVector.getOperand(0).getValueType() == MVT::x86mmx && | |||
| 46105 | isNullConstant(EltIdx) && InputVector.hasOneUse()) | |||
| 46106 | return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, | |||
| 46107 | InputVector.getOperand(0)); | |||
| 46108 | ||||
| 46109 | // Check whether this extract is the root of a sum of absolute differences | |||
| 46110 | // pattern. This has to be done here because we really want it to happen | |||
| 46111 | // pre-legalization. | |||
| 46112 | if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget)) | |||
| 46113 | return SAD; | |||
| 46114 | ||||
| 46115 | if (SDValue VPDPBUSD = combineVPDPBUSDPattern(N, DAG, Subtarget)) | |||
| 46116 | return VPDPBUSD; | |||
| 46117 | ||||
| 46118 | // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK. | |||
| 46119 | if (SDValue Cmp = combinePredicateReduction(N, DAG, Subtarget)) | |||
| 46120 | return Cmp; | |||
| 46121 | ||||
| 46122 | // Attempt to replace min/max v8i16/v16i8 reductions with PHMINPOSUW. | |||
| 46123 | if (SDValue MinMax = combineMinMaxReduction(N, DAG, Subtarget)) | |||
| 46124 | return MinMax; | |||
| 46125 | ||||
| 46126 | // Attempt to optimize ADD/FADD/MUL reductions with HADD, promotion etc. | |||
| 46127 | if (SDValue V = combineArithReduction(N, DAG, Subtarget)) | |||
| 46128 | return V; | |||
| 46129 | ||||
| 46130 | if (SDValue V = scalarizeExtEltFP(N, DAG, Subtarget)) | |||
| 46131 | return V; | |||
| 46132 | ||||
| 46133 | // Attempt to extract an i1 element by using MOVMSK to extract the signbits | |||
| 46134 | // and then testing the relevant element. | |||
| 46135 | // | |||
| 46136 | // Note that we only combine extracts on the *same* result number, i.e. | |||
| 46137 | // t0 = merge_values a0, a1, a2, a3 | |||
| 46138 | // i1 = extract_vector_elt t0, Constant:i64<2> | |||
| 46139 | // i1 = extract_vector_elt t0, Constant:i64<3> | |||
| 46140 | // but not | |||
| 46141 | // i1 = extract_vector_elt t0:1, Constant:i64<2> | |||
| 46142 | // since the latter would need its own MOVMSK. | |||
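| | // Illustrative sketch: if %bc is the scalar produced from the v4i1 source by | |||
| | // combineBitcastvxi1 below, then | |||
| | //   (i1 extract_vector_elt %src, 2) --> (%bc & (1 << 2)) == (1 << 2) | |||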
| 46143 | if (SrcVT.getScalarType() == MVT::i1) { | |||
| 46144 | bool IsVar = !CIdx; | |||
| 46145 | SmallVector<SDNode *, 16> BoolExtracts; | |||
| 46146 | unsigned ResNo = InputVector.getResNo(); | |||
| 46147 | auto IsBoolExtract = [&BoolExtracts, &ResNo, &IsVar](SDNode *Use) { | |||
| 46148 | if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT && | |||
| 46149 | Use->getOperand(0).getResNo() == ResNo && | |||
| 46150 | Use->getValueType(0) == MVT::i1) { | |||
| 46151 | BoolExtracts.push_back(Use); | |||
| 46152 | IsVar |= !isa<ConstantSDNode>(Use->getOperand(1)); | |||
| 46153 | return true; | |||
| 46154 | } | |||
| 46155 | return false; | |||
| 46156 | }; | |||
| 46157 | // TODO: Can we drop the oneuse check for constant extracts? | |||
| 46158 | if (all_of(InputVector->uses(), IsBoolExtract) && | |||
| 46159 | (IsVar || BoolExtracts.size() > 1)) { | |||
| 46160 | EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts); | |||
| 46161 | if (SDValue BC = | |||
| 46162 | combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) { | |||
| 46163 | for (SDNode *Use : BoolExtracts) { | |||
| 46164 | // extractelement vXi1 X, MaskIdx --> ((movmsk X) & Mask) == Mask | |||
| 46165 | // Mask = 1 << MaskIdx | |||
| 46166 | SDValue MaskIdx = DAG.getZExtOrTrunc(Use->getOperand(1), dl, MVT::i8); | |||
| 46167 | SDValue MaskBit = DAG.getConstant(1, dl, BCVT); | |||
| 46168 | SDValue Mask = DAG.getNode(ISD::SHL, dl, BCVT, MaskBit, MaskIdx); | |||
| 46169 | SDValue Res = DAG.getNode(ISD::AND, dl, BCVT, BC, Mask); | |||
| 46170 | Res = DAG.getSetCC(dl, MVT::i1, Res, Mask, ISD::SETEQ); | |||
| 46171 | DCI.CombineTo(Use, Res); | |||
| 46172 | } | |||
| 46173 | return SDValue(N, 0); | |||
| 46174 | } | |||
| 46175 | } | |||
| 46176 | } | |||
| 46177 | ||||
| 46178 | // If this extract is from a loaded vector value and will be used as an | |||
| 46179 | // integer, that requires a potentially expensive XMM -> GPR transfer. | |||
| 46180 | // Additionally, if we can convert to a scalar integer load, that will likely | |||
| 46181 | // be folded into a subsequent integer op. | |||
| 46182 | // Note: Unlike the related fold for this in DAGCombiner, this is not limited | |||
| 46183 | // to a single-use of the loaded vector. For the reasons above, we | |||
| 46184 | // expect this to be profitable even if it creates an extra load. | |||
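| | // E.g. (illustrative, with hypothetical %p): on x86-64 this rewrites | |||
| | //   (i32 extract_vector_elt (v4i32 load %p), 2) | |||
| | // into a direct (i32 load %p + 8), skipping the vector register entirely. | |||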
| 46185 | bool LikelyUsedAsVector = any_of(N->uses(), [](SDNode *Use) { | |||
| 46186 | return Use->getOpcode() == ISD::STORE || | |||
| 46187 | Use->getOpcode() == ISD::INSERT_VECTOR_ELT || | |||
| 46188 | Use->getOpcode() == ISD::SCALAR_TO_VECTOR; | |||
| 46189 | }); | |||
| 46190 | auto *LoadVec = dyn_cast<LoadSDNode>(InputVector); | |||
| 46191 | if (LoadVec && CIdx && ISD::isNormalLoad(LoadVec) && VT.isInteger() && | |||
| 46192 | SrcVT.getVectorElementType() == VT && DCI.isAfterLegalizeDAG() && | |||
| 46193 | !LikelyUsedAsVector && LoadVec->isSimple()) { | |||
| 46194 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 46195 | SDValue NewPtr = | |||
| 46196 | TLI.getVectorElementPointer(DAG, LoadVec->getBasePtr(), SrcVT, EltIdx); | |||
| 46197 | unsigned PtrOff = VT.getSizeInBits() * CIdx->getZExtValue() / 8; | |||
| 46198 | MachinePointerInfo MPI = LoadVec->getPointerInfo().getWithOffset(PtrOff); | |||
| 46199 | Align Alignment = commonAlignment(LoadVec->getAlign(), PtrOff); | |||
| 46200 | SDValue Load = | |||
| 46201 | DAG.getLoad(VT, dl, LoadVec->getChain(), NewPtr, MPI, Alignment, | |||
| 46202 | LoadVec->getMemOperand()->getFlags(), LoadVec->getAAInfo()); | |||
| 46203 | DAG.makeEquivalentMemoryOrdering(LoadVec, Load); | |||
| 46204 | return Load; | |||
| 46205 | } | |||
| 46206 | ||||
| 46207 | return SDValue(); | |||
| 46208 | } | |||
| 46209 | ||||
| 46210 | // Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)). | |||
| 46211 | // This is more or less the reverse of combineBitcastvxi1. | |||
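| | // Illustrative sketch of the sign_extend case: | |||
| | //   (v8i16 sign_extend (v8i1 bitcast (i8 %x))) | |||
| | // broadcasts %x to every lane, ANDs lane i with (1 << i), compares the | |||
| | // result for equality with that bitmask, and sign-extends the i1 answer. | |||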
| 46212 | static SDValue combineToExtendBoolVectorInReg( | |||
| 46213 | unsigned Opcode, const SDLoc &DL, EVT VT, SDValue N0, SelectionDAG &DAG, | |||
| 46214 | TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { | |||
| 46215 | if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND && | |||
| 46216 | Opcode != ISD::ANY_EXTEND) | |||
| 46217 | return SDValue(); | |||
| 46218 | if (!DCI.isBeforeLegalizeOps()) | |||
| 46219 | return SDValue(); | |||
| 46220 | if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) | |||
| 46221 | return SDValue(); | |||
| 46222 | ||||
| 46223 | EVT SVT = VT.getScalarType(); | |||
| 46224 | EVT InSVT = N0.getValueType().getScalarType(); | |||
| 46225 | unsigned EltSizeInBits = SVT.getSizeInBits(); | |||
| 46226 | ||||
| 46227 | // The input must be a bool vector (bitcast from a scalar | |||
| 46228 | // integer) that is being extended to legal integer vector types. | |||
| 46229 | if (!VT.isVector()) | |||
| 46230 | return SDValue(); | |||
| 46231 | if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8) | |||
| 46232 | return SDValue(); | |||
| 46233 | if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST) | |||
| 46234 | return SDValue(); | |||
| 46235 | ||||
| 46236 | SDValue N00 = N0.getOperand(0); | |||
| 46237 | EVT SclVT = N00.getValueType(); | |||
| 46238 | if (!SclVT.isScalarInteger()) | |||
| 46239 | return SDValue(); | |||
| 46240 | ||||
| 46241 | SDValue Vec; | |||
| 46242 | SmallVector<int> ShuffleMask; | |||
| 46243 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 46244 | assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size"); | |||
| 46245 | ||||
| 46246 | // Broadcast the scalar integer to the vector elements. | |||
| 46247 | if (NumElts > EltSizeInBits) { | |||
| 46248 | // If the scalar integer is greater than the vector element size, then we | |||
| 46249 | // must split it down into sub-sections for broadcasting. For example: | |||
| 46250 | // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections. | |||
| 46251 | // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections. | |||
| 46252 | assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale"); | |||
| 46253 | unsigned Scale = NumElts / EltSizeInBits; | |||
| 46254 | EVT BroadcastVT = EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits); | |||
| 46255 | Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00); | |||
| 46256 | Vec = DAG.getBitcast(VT, Vec); | |||
| 46257 | ||||
| 46258 | for (unsigned i = 0; i != Scale; ++i) | |||
| 46259 | ShuffleMask.append(EltSizeInBits, i); | |||
| 46260 | Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); | |||
| 46261 | } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits && | |||
| 46262 | (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) { | |||
| 46263 | // If we have register broadcast instructions, use the scalar size as the | |||
| 46264 | // element type for the shuffle. Then cast to the wider element type. The | |||
| 46265 | // widened bits won't be used, and this might allow the use of a broadcast | |||
| 46266 | // load. | |||
| 46267 | assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale"); | |||
| 46268 | unsigned Scale = EltSizeInBits / NumElts; | |||
| 46269 | EVT BroadcastVT = | |||
| 46270 | EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale); | |||
| 46271 | Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00); | |||
| 46272 | ShuffleMask.append(NumElts * Scale, 0); | |||
| 46273 | Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask); | |||
| 46274 | Vec = DAG.getBitcast(VT, Vec); | |||
| 46275 | } else { | |||
| 46276 | // For a smaller scalar integer, we can simply any-extend it to the vector | |||
| 46277 | // element size (we don't care about the upper bits) and broadcast it to all | |||
| 46278 | // elements. | |||
| 46279 | SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT); | |||
| 46280 | Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl); | |||
| 46281 | ShuffleMask.append(NumElts, 0); | |||
| 46282 | Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); | |||
| 46283 | } | |||
| 46284 | ||||
| 46285 | // Now, mask the relevant bit in each element. | |||
| 46286 | SmallVector<SDValue, 32> Bits; | |||
| 46287 | for (unsigned i = 0; i != NumElts; ++i) { | |||
| 46288 | int BitIdx = (i % EltSizeInBits); | |||
| 46289 | APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1); | |||
| 46290 | Bits.push_back(DAG.getConstant(Bit, DL, SVT)); | |||
| 46291 | } | |||
| 46292 | SDValue BitMask = DAG.getBuildVector(VT, DL, Bits); | |||
| 46293 | Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask); | |||
| 46294 | ||||
| 46295 | // Compare against the bitmask and extend the result. | |||
| 46296 | EVT CCVT = VT.changeVectorElementType(MVT::i1); | |||
| 46297 | Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ); | |||
| 46298 | Vec = DAG.getSExtOrTrunc(Vec, DL, VT); | |||
| 46299 | ||||
| 46300 | // For SEXT, this is now done, otherwise shift the result down for | |||
| 46301 | // zero-extension. | |||
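| | // E.g. for ZERO_EXTEND with i16 elements, the all-ones/all-zeros lanes are | |||
| | // logically shifted right by 15, leaving 1 or 0 in each lane. | |||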
| 46302 | if (Opcode == ISD::SIGN_EXTEND) | |||
| 46303 | return Vec; | |||
| 46304 | return DAG.getNode(ISD::SRL, DL, VT, Vec, | |||
| 46305 | DAG.getConstant(EltSizeInBits - 1, DL, VT)); | |||
| 46306 | } | |||
| 46307 | ||||
| 46308 | /// If a vector select has an operand that is -1 or 0, try to simplify the | |||
| 46309 | /// select to a bitwise logic operation. | |||
| 46310 | /// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()? | |||
| 46311 | static SDValue | |||
| 46312 | combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, | |||
| 46313 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 46314 | const X86Subtarget &Subtarget) { | |||
| 46315 | SDValue Cond = N->getOperand(0); | |||
| 46316 | SDValue LHS = N->getOperand(1); | |||
| 46317 | SDValue RHS = N->getOperand(2); | |||
| 46318 | EVT VT = LHS.getValueType(); | |||
| 46319 | EVT CondVT = Cond.getValueType(); | |||
| 46320 | SDLoc DL(N); | |||
| 46321 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 46322 | ||||
| 46323 | if (N->getOpcode() != ISD::VSELECT) | |||
| 46324 | return SDValue(); | |||
| 46325 | ||||
| 46326 | assert(CondVT.isVector() && "Vector select expects a vector selector!"); | |||
| 46327 | ||||
| 46328 | // TODO: Use isNullOrNullSplat() to distinguish constants with undefs? | |||
| 46329 | // TODO: Can we assert that both operands are not zeros (because that should | |||
| 46330 | // get simplified at node creation time)? | |||
| 46331 | bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); | |||
| 46332 | bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); | |||
| 46333 | ||||
| 46334 | // If both inputs are 0/undef, create a complete zero vector. | |||
| 46335 | // FIXME: As noted above this should be handled by DAGCombiner/getNode. | |||
| 46336 | if (TValIsAllZeros && FValIsAllZeros) { | |||
| 46337 | if (VT.isFloatingPoint()) | |||
| 46338 | return DAG.getConstantFP(0.0, DL, VT); | |||
| 46339 | return DAG.getConstant(0, DL, VT); | |||
| 46340 | } | |||
| 46341 | ||||
| 46342 | // To use the condition operand as a bitwise mask, it must have elements that | |||
| 46343 | // are the same size as the select elements. Ie, the condition operand must | |||
| 46344 | // have already been promoted from the IR select condition type <N x i1>. | |||
| 46345 | // Don't check if the types themselves are equal because that excludes | |||
| 46346 | // vector floating-point selects. | |||
| 46347 | if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits()) | |||
| 46348 | return SDValue(); | |||
| 46349 | ||||
| 46350 | // Try to invert the condition if true value is not all 1s and false value is | |||
| 46351 | // not all 0s. Only do this if the condition has one use. | |||
| 46352 | bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); | |||
| 46353 | if (!TValIsAllOnes && !FValIsAllZeros && Cond.hasOneUse() && | |||
| 46354 | // Check if the selector will be produced by CMPP*/PCMP*. | |||
| 46355 | Cond.getOpcode() == ISD::SETCC && | |||
| 46356 | // Check if SETCC has already been promoted. | |||
| 46357 | TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) == | |||
| 46358 | CondVT) { | |||
| 46359 | bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); | |||
| 46360 | ||||
| 46361 | if (TValIsAllZeros || FValIsAllOnes) { | |||
| 46362 | SDValue CC = Cond.getOperand(2); | |||
| 46363 | ISD::CondCode NewCC = ISD::getSetCCInverse( | |||
| 46364 | cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType()); | |||
| 46365 | Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1), | |||
| 46366 | NewCC); | |||
| 46367 | std::swap(LHS, RHS); | |||
| 46368 | TValIsAllOnes = FValIsAllOnes; | |||
| 46369 | FValIsAllZeros = TValIsAllZeros; | |||
| 46370 | } | |||
| 46371 | } | |||
| 46372 | ||||
| 46373 | // Cond value must be 'sign splat' to be converted to a logical op. | |||
| 46374 | if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits()) | |||
| 46375 | return SDValue(); | |||
| 46376 | ||||
| 46377 | // vselect Cond, 111..., 000... -> Cond | |||
| 46378 | if (TValIsAllOnes && FValIsAllZeros) | |||
| 46379 | return DAG.getBitcast(VT, Cond); | |||
| 46380 | ||||
| 46381 | if (!TLI.isTypeLegal(CondVT)) | |||
| 46382 | return SDValue(); | |||
| 46383 | ||||
| 46384 | // vselect Cond, 111..., X -> or Cond, X | |||
| 46385 | if (TValIsAllOnes) { | |||
| 46386 | SDValue CastRHS = DAG.getBitcast(CondVT, RHS); | |||
| 46387 | SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS); | |||
| 46388 | return DAG.getBitcast(VT, Or); | |||
| 46389 | } | |||
| 46390 | ||||
| 46391 | // vselect Cond, X, 000... -> and Cond, X | |||
| 46392 | if (FValIsAllZeros) { | |||
| 46393 | SDValue CastLHS = DAG.getBitcast(CondVT, LHS); | |||
| 46394 | SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS); | |||
| 46395 | return DAG.getBitcast(VT, And); | |||
| 46396 | } | |||
| 46397 | ||||
| 46398 | // vselect Cond, 000..., X -> andn Cond, X | |||
| 46399 | if (TValIsAllZeros) { | |||
| 46400 | SDValue CastRHS = DAG.getBitcast(CondVT, RHS); | |||
| 46401 | SDValue AndN; | |||
| 46402 | // The canonical form differs for i1 vectors - X86ISD::ANDNP is not used. | |||
| 46403 | if (CondVT.getScalarType() == MVT::i1) | |||
| 46404 | AndN = DAG.getNode(ISD::AND, DL, CondVT, DAG.getNOT(DL, Cond, CondVT), | |||
| 46405 | CastRHS); | |||
| 46406 | else | |||
| 46407 | AndN = DAG.getNode(X86ISD::ANDNP, DL, CondVT, Cond, CastRHS); | |||
| 46408 | return DAG.getBitcast(VT, AndN); | |||
| 46409 | } | |||
| 46410 | ||||
| 46411 | return SDValue(); | |||
| 46412 | } | |||
| 46413 | ||||
| 46414 | /// If both arms of a vector select are concatenated vectors, split the select, | |||
| 46415 | /// and concatenate the result to eliminate a wide (256-bit) vector instruction: | |||
| 46416 | /// vselect Cond, (concat T0, T1), (concat F0, F1) --> | |||
| 46417 | /// concat (vselect (split Cond), T0, F0), (vselect (split Cond), T1, F1) | |||
| 46418 | static SDValue narrowVectorSelect(SDNode *N, SelectionDAG &DAG, | |||
| 46419 | const X86Subtarget &Subtarget) { | |||
| 46420 | unsigned Opcode = N->getOpcode(); | |||
| 46421 | if (Opcode != X86ISD::BLENDV && Opcode != ISD::VSELECT) | |||
| 46422 | return SDValue(); | |||
| 46423 | ||||
| 46424 | // TODO: Split 512-bit vectors too? | |||
| 46425 | EVT VT = N->getValueType(0); | |||
| 46426 | if (!VT.is256BitVector()) | |||
| 46427 | return SDValue(); | |||
| 46428 | ||||
| 46429 | // TODO: Split as long as any 2 of the 3 operands are concatenated? | |||
| 46430 | SDValue Cond = N->getOperand(0); | |||
| 46431 | SDValue TVal = N->getOperand(1); | |||
| 46432 | SDValue FVal = N->getOperand(2); | |||
| 46433 | SmallVector<SDValue, 4> CatOpsT, CatOpsF; | |||
| 46434 | if (!TVal.hasOneUse() || !FVal.hasOneUse() || | |||
| 46435 | !collectConcatOps(TVal.getNode(), CatOpsT, DAG) || | |||
| 46436 | !collectConcatOps(FVal.getNode(), CatOpsF, DAG)) | |||
| 46437 | return SDValue(); | |||
| 46438 | ||||
| 46439 | auto makeBlend = [Opcode](SelectionDAG &DAG, const SDLoc &DL, | |||
| 46440 | ArrayRef<SDValue> Ops) { | |||
| 46441 | return DAG.getNode(Opcode, DL, Ops[1].getValueType(), Ops); | |||
| 46442 | }; | |||
| 46443 | return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { Cond, TVal, FVal }, | |||
| 46444 | makeBlend, /*CheckBWI*/ false); | |||
| 46445 | } | |||
| 46446 | ||||
| 46447 | static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { | |||
| 46448 | SDValue Cond = N->getOperand(0); | |||
| 46449 | SDValue LHS = N->getOperand(1); | |||
| 46450 | SDValue RHS = N->getOperand(2); | |||
| 46451 | SDLoc DL(N); | |||
| 46452 | ||||
| 46453 | auto *TrueC = dyn_cast<ConstantSDNode>(LHS); | |||
| 46454 | auto *FalseC = dyn_cast<ConstantSDNode>(RHS); | |||
| 46455 | if (!TrueC || !FalseC) | |||
| 46456 | return SDValue(); | |||
| 46457 | ||||
| 46458 | // Don't do this for crazy integer types. | |||
| 46459 | EVT VT = N->getValueType(0); | |||
| 46460 | if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) | |||
| 46461 | return SDValue(); | |||
| 46462 | ||||
| 46463 | // We're going to use the condition bit in math or logic ops. We could allow | |||
| 46464 | // this with a wider condition value (post-legalization it becomes an i8), | |||
| 46465 | // but if nothing is creating selects that late, it doesn't matter. | |||
| 46466 | if (Cond.getValueType() != MVT::i1) | |||
| 46467 | return SDValue(); | |||
| 46468 | ||||
| 46469 | // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by | |||
| 46470 | // 3, 5, or 9 with i32/i64, so those get transformed too. | |||
| 46471 | // TODO: For constants that overflow or do not differ by power-of-2 or small | |||
| 46472 | // multiplier, convert to 'and' + 'add'. | |||
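| | // (A multiply by 3, 5, or 9 maps to a single LEA, e.g. | |||
| | //  leaq (%rdi,%rdi,2), %rax computes 3*%rdi.) | |||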
| 46473 | const APInt &TrueVal = TrueC->getAPIntValue(); | |||
| 46474 | const APInt &FalseVal = FalseC->getAPIntValue(); | |||
| 46475 | ||||
| 46476 | // We have a more efficient lowering for "(X == 0) ? Y : -1" using SBB. | |||
| 46477 | if ((TrueVal.isAllOnes() || FalseVal.isAllOnes()) && | |||
| 46478 | Cond.getOpcode() == ISD::SETCC && isNullConstant(Cond.getOperand(1))) { | |||
| 46479 | ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); | |||
| 46480 | if (CC == ISD::SETEQ || CC == ISD::SETNE) | |||
| 46481 | return SDValue(); | |||
| 46482 | } | |||
| 46483 | ||||
| 46484 | bool OV; | |||
| 46485 | APInt Diff = TrueVal.ssub_ov(FalseVal, OV); | |||
| 46486 | if (OV) | |||
| 46487 | return SDValue(); | |||
| 46488 | ||||
| 46489 | APInt AbsDiff = Diff.abs(); | |||
| 46490 | if (AbsDiff.isPowerOf2() || | |||
| 46491 | ((VT == MVT::i32 || VT == MVT::i64) && | |||
| 46492 | (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) { | |||
| 46493 | ||||
| 46494 | // We need a positive multiplier constant for shift/LEA codegen. The 'not' | |||
| 46495 | // of the condition can usually be folded into a compare predicate, but even | |||
| 46496 | // without that, the sequence should be cheaper than a CMOV alternative. | |||
| 46497 | if (TrueVal.slt(FalseVal)) { | |||
| 46498 | Cond = DAG.getNOT(DL, Cond, MVT::i1); | |||
| 46499 | std::swap(TrueC, FalseC); | |||
| 46500 | } | |||
| 46501 | ||||
| 46502 | // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC | |||
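| | // Worked example (illustrative): select i1 %c, i32 7, i32 3 has AbsDiff 4, | |||
| | // so it becomes (zext %c) * 4 + 3 (the multiply later lowers to a shift), | |||
| | // yielding 7 when %c is 1 and 3 when %c is 0. | |||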
| 46503 | SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); | |||
| 46504 | ||||
| 46505 | // Multiply condition by the difference if non-one. | |||
| 46506 | if (!AbsDiff.isOne()) | |||
| 46507 | R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT)); | |||
| 46508 | ||||
| 46509 | // Add the base if non-zero. | |||
| 46510 | if (!FalseC->isZero()) | |||
| 46511 | R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0)); | |||
| 46512 | ||||
| 46513 | return R; | |||
| 46514 | } | |||
| 46515 | ||||
| 46516 | return SDValue(); | |||
| 46517 | } | |||
| 46518 | ||||
| 46519 | /// If this is a *dynamic* select (non-constant condition) and we can match | |||
| 46520 | /// this node with one of the variable blend instructions, restructure the | |||
| 46521 | /// condition so that blends can use the high (sign) bit of each element. | |||
| 46522 | /// This function will also call SimplifyDemandedBits on already created | |||
| 46523 | /// BLENDV to perform additional simplifications. | |||
| 46524 | static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG, | |||
| 46525 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 46526 | const X86Subtarget &Subtarget) { | |||
| 46527 | SDValue Cond = N->getOperand(0); | |||
| 46528 | if ((N->getOpcode() != ISD::VSELECT && | |||
| 46529 | N->getOpcode() != X86ISD::BLENDV) || | |||
| 46530 | ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) | |||
| 46531 | return SDValue(); | |||
| 46532 | ||||
| 46533 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 46534 | unsigned BitWidth = Cond.getScalarValueSizeInBits(); | |||
| 46535 | EVT VT = N->getValueType(0); | |||
| 46536 | ||||
| 46537 | // We can only handle the cases where VSELECT is directly legal on the | |||
| 46538 | // subtarget. We custom lower VSELECT nodes with constant conditions and | |||
| 46539 | // this makes it hard to see whether a dynamic VSELECT will correctly | |||
| 46540 | // lower, so we both check the operation's status and explicitly handle the | |||
| 46541 | // cases where a *dynamic* blend will fail even though a constant-condition | |||
| 46542 | // blend could be custom lowered. | |||
| 46543 | // FIXME: We should find a better way to handle this class of problems. | |||
| 46544 | // Potentially, we should combine constant-condition vselect nodes | |||
| 46545 | // pre-legalization into shuffles and not mark as many types as custom | |||
| 46546 | // lowered. | |||
| 46547 | if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT)) | |||
| 46548 | return SDValue(); | |||
| 46549 | // FIXME: We don't support i16-element blends currently. We could and | |||
| 46550 | // should support them by making *all* the bits in the condition be set | |||
| 46551 | // rather than just the high bit and using an i8-element blend. | |||
| 46552 | if (VT.getVectorElementType() == MVT::i16) | |||
| 46553 | return SDValue(); | |||
| 46554 | // Dynamic blending was only available from SSE4.1 onward. | |||
| 46555 | if (VT.is128BitVector() && !Subtarget.hasSSE41()) | |||
| 46556 | return SDValue(); | |||
| 46557 | // Byte blends are only available with AVX2. | |||
| 46558 | if (VT == MVT::v32i8 && !Subtarget.hasAVX2()) | |||
| 46559 | return SDValue(); | |||
| 46560 | // There are no 512-bit blend instructions that use sign bits. | |||
| 46561 | if (VT.is512BitVector()) | |||
| 46562 | return SDValue(); | |||
| 46563 | ||||
| 46564 | // Don't optimize before the condition has been transformed to a legal type | |||
| 46565 | // and don't ever optimize vector selects that map to AVX512 mask-registers. | |||
| 46566 | if (BitWidth < 8 || BitWidth > 64) | |||
| 46567 | return SDValue(); | |||
| 46568 | ||||
| 46569 | auto OnlyUsedAsSelectCond = [](SDValue Cond) { | |||
| 46570 | for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end(); | |||
| 46571 | UI != UE; ++UI) | |||
| 46572 | if ((UI->getOpcode() != ISD::VSELECT && | |||
| 46573 | UI->getOpcode() != X86ISD::BLENDV) || | |||
| 46574 | UI.getOperandNo() != 0) | |||
| 46575 | return false; | |||
| 46576 | ||||
| 46577 | return true; | |||
| 46578 | }; | |||
| 46579 | ||||
| 46580 | APInt DemandedBits(APInt::getSignMask(BitWidth)); | |||
| 46581 | ||||
| 46582 | if (OnlyUsedAsSelectCond(Cond)) { | |||
| 46583 | KnownBits Known; | |||
| 46584 | TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), | |||
| 46585 | !DCI.isBeforeLegalizeOps()); | |||
| 46586 | if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true)) | |||
| 46587 | return SDValue(); | |||
| 46588 | ||||
| 46589 | // If we changed the computation somewhere in the DAG, this change will | |||
| 46590 | // affect all users of Cond. Update all the nodes so that we do not use | |||
| 46591 | // the generic VSELECT anymore. Otherwise, we may perform wrong | |||
| 46592 | // optimizations as we messed with the actual expectation for the vector | |||
| 46593 | // boolean values. | |||
| 46594 | for (SDNode *U : Cond->uses()) { | |||
| 46595 | if (U->getOpcode() == X86ISD::BLENDV) | |||
| 46596 | continue; | |||
| 46597 | ||||
| 46598 | SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0), | |||
| 46599 | Cond, U->getOperand(1), U->getOperand(2)); | |||
| 46600 | DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB); | |||
| 46601 | DCI.AddToWorklist(U); | |||
| 46602 | } | |||
| 46603 | DCI.CommitTargetLoweringOpt(TLO); | |||
| 46604 | return SDValue(N, 0); | |||
| 46605 | } | |||
| 46606 | ||||
| 46607 | // Otherwise we can still at least try to simplify multiple use bits. | |||
| 46608 | if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG)) | |||
| 46609 | return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V, | |||
| 46610 | N->getOperand(1), N->getOperand(2)); | |||
| 46611 | ||||
| 46612 | return SDValue(); | |||
| 46613 | } | |||
| 46614 | ||||
| 46615 | // Try to match: | |||
| 46616 | // (or (and (M, (sub 0, X)), (pandn M, X))) | |||
| 46617 | // which is a special case of: | |||
| 46618 | // (select M, (sub 0, X), X) | |||
| 46619 | // Per: | |||
| 46620 | // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate | |||
| 46621 | // We know that, if fNegate is 0 or 1: | |||
| 46622 | // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate) | |||
| 46623 | // | |||
| 46624 | // Here, we have a mask, M (all 1s or 0), and, similarly, we know that: | |||
| 46625 | // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1)) | |||
| 46626 | // ( M ? -X : X) == ((X ^ M ) + (M & 1)) | |||
| 46627 | // This lets us transform our vselect to: | |||
| 46628 | // (add (xor X, M), (and M, 1)) | |||
| 46629 | // And further to: | |||
| 46630 | // (sub (xor X, M), M) | |||
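| | // Worked example with X = 5: if M is all-ones, (5 ^ M) - M = -6 + 1 = -5; | |||
| | // if M is zero, (5 ^ 0) - 0 = 5. So sub(xor(X, M), M) picks X or -X. | |||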
| 46631 | static SDValue combineLogicBlendIntoConditionalNegate( | |||
| 46632 | EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL, | |||
| 46633 | SelectionDAG &DAG, const X86Subtarget &Subtarget) { | |||
| 46634 | EVT MaskVT = Mask.getValueType(); | |||
| 46635 | assert(MaskVT.isInteger() && | |||
| 46636 | DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() && | |||
| 46637 | "Mask must be zero/all-bits"); | |||
| 46638 | ||||
| 46639 | if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT) | |||
| 46640 | return SDValue(); | |||
| 46641 | if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) | |||
| 46642 | return SDValue(); | |||
| 46643 | ||||
| 46644 | auto IsNegV = [](SDNode *N, SDValue V) { | |||
| 46645 | return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && | |||
| 46646 | ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); | |||
| 46647 | }; | |||
| 46648 | ||||
| 46649 | SDValue V; | |||
| 46650 | if (IsNegV(Y.getNode(), X)) | |||
| 46651 | V = X; | |||
| 46652 | else if (IsNegV(X.getNode(), Y)) | |||
| 46653 | V = Y; | |||
| 46654 | else | |||
| 46655 | return SDValue(); | |||
| 46656 | ||||
| 46657 | SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); | |||
| 46658 | SDValue SubOp2 = Mask; | |||
| 46659 | ||||
| 46660 | // If the negate was on the false side of the select, then | |||
| 46661 | // the operands of the SUB need to be swapped. PR 27251. | |||
| 46662 | // This is because the pattern being matched above is | |||
| 46663 | // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M) | |||
| 46664 | // but if the pattern matched was | |||
| 46665 | // (vselect M, X, (sub (0, X))), that is really negation of the pattern | |||
| 46666 | // above, -(vselect M, (sub 0, X), X), and therefore the replacement | |||
| 46667 | // pattern also needs to be a negation of the replacement pattern above. | |||
| 46668 | // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the | |||
| 46669 | // sub accomplishes the negation of the replacement pattern. | |||
| 46670 | if (V == Y) | |||
| 46671 | std::swap(SubOp1, SubOp2); | |||
| 46672 | ||||
| 46673 | SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); | |||
| 46674 | return DAG.getBitcast(VT, Res); | |||
| 46675 | } | |||
| 46676 | ||||
| 46677 | /// Do target-specific dag combines on SELECT and VSELECT nodes. | |||
| 46678 | static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, | |||
| 46679 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 46680 | const X86Subtarget &Subtarget) { | |||
| 46681 | SDLoc DL(N); | |||
| 46682 | SDValue Cond = N->getOperand(0); | |||
| 46683 | SDValue LHS = N->getOperand(1); | |||
| 46684 | SDValue RHS = N->getOperand(2); | |||
| 46685 | ||||
| 46686 | // Try simplification again because we use this function to optimize | |||
| 46687 | // BLENDV nodes that are not handled by the generic combiner. | |||
| 46688 | if (SDValue V = DAG.simplifySelect(Cond, LHS, RHS)) | |||
| 46689 | return V; | |||
| 46690 | ||||
| 46691 | EVT VT = LHS.getValueType(); | |||
| 46692 | EVT CondVT = Cond.getValueType(); | |||
| 46693 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 46694 | bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()); | |||
| 46695 | ||||
| 46696 | // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M). | |||
| 46697 | // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT | |||
| 46698 | // can't catch, plus vXi8 cases where we'd likely end up with BLENDV. | |||
| 46699 | if (CondVT.isVector() && CondVT.isInteger() && | |||
| 46700 | CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() && | |||
| 46701 | (!CondConstantVector || CondVT.getScalarType() == MVT::i8) && | |||
| 46702 | DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits()) | |||
| 46703 | if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS, | |||
| 46704 | DL, DAG, Subtarget)) | |||
| 46705 | return V; | |||
| 46706 | ||||
| 46707 | // Convert vselects with constant condition into shuffles. | |||
| 46708 | if (CondConstantVector && DCI.isBeforeLegalizeOps() && | |||
| 46709 | (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) { | |||
| 46710 | SmallVector<int, 64> Mask; | |||
| 46711 | if (createShuffleMaskFromVSELECT(Mask, Cond, | |||
| 46712 | N->getOpcode() == X86ISD::BLENDV)) | |||
| 46713 | return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask); | |||
| 46714 | } | |||
| 46715 | ||||
| 46716 | // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y)) | |||
| 46717 | // by forcing the unselected elements to zero. | |||
| 46718 | // TODO: Can we handle more shuffles with this? | |||
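| | // (PSHUFB zeroes any byte whose control byte has the high bit set, which is | |||
| | // why 0x80 marks the unselected lanes below.) | |||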
| 46719 | if (N->getOpcode() == ISD::VSELECT && CondVT.isVector() && | |||
| 46720 | LHS.getOpcode() == X86ISD::PSHUFB && RHS.getOpcode() == X86ISD::PSHUFB && | |||
| 46721 | LHS.hasOneUse() && RHS.hasOneUse()) { | |||
| 46722 | MVT SimpleVT = VT.getSimpleVT(); | |||
| 46723 | SmallVector<SDValue, 1> LHSOps, RHSOps; | |||
| 46724 | SmallVector<int, 64> LHSMask, RHSMask, CondMask; | |||
| 46725 | if (createShuffleMaskFromVSELECT(CondMask, Cond) && | |||
| 46726 | getTargetShuffleMask(LHS.getNode(), SimpleVT, true, LHSOps, LHSMask) && | |||
| 46727 | getTargetShuffleMask(RHS.getNode(), SimpleVT, true, RHSOps, RHSMask)) { | |||
| 46728 | int NumElts = VT.getVectorNumElements(); | |||
| 46729 | for (int i = 0; i != NumElts; ++i) { | |||
| 46730 | // getConstVector sets negative shuffle mask values as undef, so ensure | |||
| 46731 | // we hardcode SM_SentinelZero values to zero (0x80). | |||
| 46732 | if (CondMask[i] < NumElts) { | |||
| 46733 | LHSMask[i] = isUndefOrZero(LHSMask[i]) ? 0x80 : LHSMask[i]; | |||
| 46734 | RHSMask[i] = 0x80; | |||
| 46735 | } else { | |||
| 46736 | LHSMask[i] = 0x80; | |||
| 46737 | RHSMask[i] = isUndefOrZero(RHSMask[i]) ? 0x80 : RHSMask[i]; | |||
| 46738 | } | |||
| 46739 | } | |||
| 46740 | LHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, LHS.getOperand(0), | |||
| 46741 | getConstVector(LHSMask, SimpleVT, DAG, DL, true)); | |||
| 46742 | RHS = DAG.getNode(X86ISD::PSHUFB, DL, VT, RHS.getOperand(0), | |||
| 46743 | getConstVector(RHSMask, SimpleVT, DAG, DL, true)); | |||
| 46744 | return DAG.getNode(ISD::OR, DL, VT, LHS, RHS); | |||
| 46745 | } | |||
| 46746 | } | |||
| 46747 | ||||
| 46748 | // If we have SSE[12] support, try to form min/max nodes. SSE min/max | |||
| 46749 | // instructions match the semantics of the common C idiom x<y?x:y but not | |||
| 46750 | // x<=y?x:y, because of how they handle negative zero (which can be | |||
| 46751 | // ignored in unsafe-math mode). | |||
| 46752 | // We also try to create v2f32 min/max nodes, which we later widen to v4f32. | |||
| 46753 | if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() && | |||
| 46754 | VT != MVT::f80 && VT != MVT::f128 && !isSoftFP16(VT, Subtarget) && | |||
| 46755 | (TLI.isTypeLegal(VT) || VT == MVT::v2f32) && | |||
| 46756 | (Subtarget.hasSSE2() || | |||
| 46757 | (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) { | |||
| 46758 | ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); | |||
| 46759 | ||||
| 46760 | unsigned Opcode = 0; | |||
| 46761 | // Check for x CC y ? x : y. | |||
| 46762 | if (DAG.isEqualTo(LHS, Cond.getOperand(0)) && | |||
| 46763 | DAG.isEqualTo(RHS, Cond.getOperand(1))) { | |||
| 46764 | switch (CC) { | |||
| 46765 | default: break; | |||
| 46766 | case ISD::SETULT: | |||
| 46767 | // Converting this to a min would handle NaNs incorrectly, and swapping | |||
| 46768 | // the operands would cause it to handle comparisons between positive | |||
| 46769 | // and negative zero incorrectly. | |||
| 46770 | if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { | |||
| 46771 | if (!DAG.getTarget().Options.NoSignedZerosFPMath && | |||
| 46772 | !(DAG.isKnownNeverZeroFloat(LHS) || | |||
| 46773 | DAG.isKnownNeverZeroFloat(RHS))) | |||
| 46774 | break; | |||
| 46775 | std::swap(LHS, RHS); | |||
| 46776 | } | |||
| 46777 | Opcode = X86ISD::FMIN; | |||
| 46778 | break; | |||
| 46779 | case ISD::SETOLE: | |||
| 46780 | // Converting this to a min would handle comparisons between positive | |||
| 46781 | // and negative zero incorrectly. | |||
| 46782 | if (!DAG.getTarget().Options.NoSignedZerosFPMath && | |||
| 46783 | !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS)) | |||
| 46784 | break; | |||
| 46785 | Opcode = X86ISD::FMIN; | |||
| 46786 | break; | |||
| 46787 | case ISD::SETULE: | |||
| 46788 | // Converting this to a min would handle both negative zeros and NaNs | |||
| 46789 | // incorrectly, but we can swap the operands to fix both. | |||
| 46790 | std::swap(LHS, RHS); | |||
| 46791 | [[fallthrough]]; | |||
| 46792 | case ISD::SETOLT: | |||
| 46793 | case ISD::SETLT: | |||
| 46794 | case ISD::SETLE: | |||
| 46795 | Opcode = X86ISD::FMIN; | |||
| 46796 | break; | |||
| 46797 | ||||
| 46798 | case ISD::SETOGE: | |||
| 46799 | // Converting this to a max would handle comparisons between positive | |||
| 46800 | // and negative zero incorrectly. | |||
| 46801 | if (!DAG.getTarget().Options.NoSignedZerosFPMath && | |||
| 46802 | !DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS)) | |||
| 46803 | break; | |||
| 46804 | Opcode = X86ISD::FMAX; | |||
| 46805 | break; | |||
| 46806 | case ISD::SETUGT: | |||
| 46807 | // Converting this to a max would handle NaNs incorrectly, and swapping | |||
| 46808 | // the operands would cause it to handle comparisons between positive | |||
| 46809 | // and negative zero incorrectly. | |||
| 46810 | if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) { | |||
| 46811 | if (!DAG.getTarget().Options.NoSignedZerosFPMath && | |||
| 46812 | !(DAG.isKnownNeverZeroFloat(LHS) || | |||
| 46813 | DAG.isKnownNeverZeroFloat(RHS))) | |||
| 46814 | break; | |||
| 46815 | std::swap(LHS, RHS); | |||
| 46816 | } | |||
| 46817 | Opcode = X86ISD::FMAX; | |||
| 46818 | break; | |||
| 46819 | case ISD::SETUGE: | |||
| 46820 | // Converting this to a max would handle both negative zeros and NaNs | |||
| 46821 | // incorrectly, but we can swap the operands to fix both. | |||
| 46822 | std::swap(LHS, RHS); | |||
| 46823 | [[fallthrough]]; | |||
| 46824 | case ISD::SETOGT: | |||
| 46825 | case ISD::SETGT: | |||
| 46826 | case ISD::SETGE: | |||
| 46827 | Opcode = X86ISD::FMAX; | |||
| 46828 | break; | |||
| 46829 | } | |||
| 46830 | // Check for x CC y ? y : x -- a min/max with reversed arms. | |||
| 46831 | } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) && | |||
| 46832 | DAG.isEqualTo(RHS, Cond.getOperand(0))) { | |||
| 46833 | switch (CC) { | |||
| 46834 | default: break; | |||
| 46835 | case ISD::SETOGE: | |||
| 46836 | // Converting this to a min would handle comparisons between positive | |||
| 46837 | // and negative zero incorrectly, and swapping the operands would | |||
| 46838 | // cause it to handle NaNs incorrectly. | |||
| 46839 | if (!DAG.getTarget().Options.NoSignedZerosFPMath && | |||
| 46840 | !(DAG.isKnownNeverZeroFloat(LHS) || | |||
| 46841 | DAG.isKnownNeverZeroFloat(RHS))) { | |||
| 46842 | if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) | |||
| 46843 | break; | |||
| 46844 | std::swap(LHS, RHS); | |||
| 46845 | } | |||
| 46846 | Opcode = X86ISD::FMIN; | |||
| 46847 | break; | |||
| 46848 | case ISD::SETUGT: | |||
| 46849 | // Converting this to a min would handle NaNs incorrectly. | |||
| 46850 | if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) | |||
| 46851 | break; | |||
| 46852 | Opcode = X86ISD::FMIN; | |||
| 46853 | break; | |||
| 46854 | case ISD::SETUGE: | |||
| 46855 | // Converting this to a min would handle both negative zeros and NaNs | |||
| 46856 | // incorrectly, but we can swap the operands to fix both. | |||
| 46857 | std::swap(LHS, RHS); | |||
| 46858 | [[fallthrough]]; | |||
| 46859 | case ISD::SETOGT: | |||
| 46860 | case ISD::SETGT: | |||
| 46861 | case ISD::SETGE: | |||
| 46862 | Opcode = X86ISD::FMIN; | |||
| 46863 | break; | |||
| 46864 | ||||
| 46865 | case ISD::SETULT: | |||
| 46866 | // Converting this to a max would handle NaNs incorrectly. | |||
| 46867 | if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) | |||
| 46868 | break; | |||
| 46869 | Opcode = X86ISD::FMAX; | |||
| 46870 | break; | |||
| 46871 | case ISD::SETOLE: | |||
| 46872 | // Converting this to a max would handle comparisons between positive | |||
| 46873 | // and negative zero incorrectly, and swapping the operands would | |||
| 46874 | // cause it to handle NaNs incorrectly. | |||
| 46875 | if (!DAG.getTarget().Options.NoSignedZerosFPMath && | |||
| 46876 | !DAG.isKnownNeverZeroFloat(LHS) && | |||
| 46877 | !DAG.isKnownNeverZeroFloat(RHS)) { | |||
| 46878 | if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) | |||
| 46879 | break; | |||
| 46880 | std::swap(LHS, RHS); | |||
| 46881 | } | |||
| 46882 | Opcode = X86ISD::FMAX; | |||
| 46883 | break; | |||
| 46884 | case ISD::SETULE: | |||
| 46885 | // Converting this to a max would handle both negative zeros and NaNs | |||
| 46886 | // incorrectly, but we can swap the operands to fix both. | |||
| 46887 | std::swap(LHS, RHS); | |||
| 46888 | [[fallthrough]]; | |||
| 46889 | case ISD::SETOLT: | |||
| 46890 | case ISD::SETLT: | |||
| 46891 | case ISD::SETLE: | |||
| 46892 | Opcode = X86ISD::FMAX; | |||
| 46893 | break; | |||
| 46894 | } | |||
| 46895 | } | |||
| 46896 | ||||
| 46897 | if (Opcode) | |||
| 46898 | return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS); | |||
| 46899 | } | |||
| 46900 | ||||
| 46901 | // Some mask scalar intrinsics rely on checking if only one bit is set | |||
| 46902 | // and implement it in C code like this: | |||
| 46903 | // A[0] = (U & 1) ? A[0] : W[0]; | |||
| 46904 | // This creates some redundant instructions that break pattern matching. | |||
| 46905 | // fold (select (setcc (and (X, 1), 0, seteq), Y, Z)) -> select(and(X, 1),Z,Y) | |||
| 46906 | if (Subtarget.hasAVX512() && N->getOpcode() == ISD::SELECT && | |||
| 46907 | Cond.getOpcode() == ISD::SETCC && (VT == MVT::f32 || VT == MVT::f64)) { | |||
| 46908 | ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); | |||
| 46909 | SDValue AndNode = Cond.getOperand(0); | |||
| 46910 | if (AndNode.getOpcode() == ISD::AND && CC == ISD::SETEQ && | |||
| 46911 | isNullConstant(Cond.getOperand(1)) && | |||
| 46912 | isOneConstant(AndNode.getOperand(1))) { | |||
| 46913 | // LHS and RHS are swapped because the setcc outputs 1 | |||
| 46914 | // when the AND resulted in 0, and vice versa. | |||
| 46915 | AndNode = DAG.getZExtOrTrunc(AndNode, DL, MVT::i8); | |||
| 46916 | return DAG.getNode(ISD::SELECT, DL, VT, AndNode, RHS, LHS); | |||
| 46917 | } | |||
| 46918 | } | |||
| 46919 | ||||
| 46920 | // v16i8 (select v16i1, v16i8, v16i8) does not have a proper | |||
| 46921 | // lowering on KNL. In this case we convert it to | |||
| 46922 | // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction. | |||
| 46923 | // The same applies to all vectors of i8 and i16 elements without BWI. | |||
| 46924 | // Make sure we extend these even before type legalization gets a chance to | |||
| 46925 | // split wide vectors. | |||
| 46926 | // Since SKX these selects have a proper lowering. | |||
| 46927 | if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && CondVT.isVector() && | |||
| 46928 | CondVT.getVectorElementType() == MVT::i1 && | |||
| 46929 | (VT.getVectorElementType() == MVT::i8 || | |||
| 46930 | VT.getVectorElementType() == MVT::i16)) { | |||
| 46931 | Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond); | |||
| 46932 | return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS); | |||
| 46933 | } | |||
| 46934 | ||||
| 46935 | // AVX512 - Extend select with zero to merge with target shuffle. | |||
| 46936 | // select(mask, extract_subvector(shuffle(x)), zero) --> | |||
| 46937 | // extract_subvector(select(insert_subvector(mask), shuffle(x), zero)) | |||
| 46938 | // TODO - support non target shuffles as well. | |||
| 46939 | if (Subtarget.hasAVX512() && CondVT.isVector() && | |||
| 46940 | CondVT.getVectorElementType() == MVT::i1) { | |||
| 46941 | auto SelectableOp = [&TLI](SDValue Op) { | |||
| 46942 | return Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && | |||
| 46943 | isTargetShuffle(Op.getOperand(0).getOpcode()) && | |||
| 46944 | isNullConstant(Op.getOperand(1)) && | |||
| 46945 | TLI.isTypeLegal(Op.getOperand(0).getValueType()) && | |||
| 46946 | Op.hasOneUse() && Op.getOperand(0).hasOneUse(); | |||
| 46947 | }; | |||
| 46948 | ||||
| 46949 | bool SelectableLHS = SelectableOp(LHS); | |||
| 46950 | bool SelectableRHS = SelectableOp(RHS); | |||
| 46951 | bool ZeroLHS = ISD::isBuildVectorAllZeros(LHS.getNode()); | |||
| 46952 | bool ZeroRHS = ISD::isBuildVectorAllZeros(RHS.getNode()); | |||
| 46953 | ||||
| 46954 | if ((SelectableLHS && ZeroRHS) || (SelectableRHS && ZeroLHS)) { | |||
| 46955 | EVT SrcVT = SelectableLHS ? LHS.getOperand(0).getValueType() | |||
| 46956 | : RHS.getOperand(0).getValueType(); | |||
| 46957 | EVT SrcCondVT = SrcVT.changeVectorElementType(MVT::i1); | |||
| 46958 | LHS = insertSubVector(DAG.getUNDEF(SrcVT), LHS, 0, DAG, DL, | |||
| 46959 | VT.getSizeInBits()); | |||
| 46960 | RHS = insertSubVector(DAG.getUNDEF(SrcVT), RHS, 0, DAG, DL, | |||
| 46961 | VT.getSizeInBits()); | |||
| 46962 | Cond = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcCondVT, | |||
| 46963 | DAG.getUNDEF(SrcCondVT), Cond, | |||
| 46964 | DAG.getIntPtrConstant(0, DL)); | |||
| 46965 | SDValue Res = DAG.getSelect(DL, SrcVT, Cond, LHS, RHS); | |||
| 46966 | return extractSubVector(Res, 0, DAG, DL, VT.getSizeInBits()); | |||
| 46967 | } | |||
| 46968 | } | |||
| 46969 | ||||
| 46970 | if (SDValue V = combineSelectOfTwoConstants(N, DAG)) | |||
| 46971 | return V; | |||
| 46972 | ||||
| 46973 | if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC && | |||
| 46974 | Cond.hasOneUse()) { | |||
| 46975 | EVT CondVT = Cond.getValueType(); | |||
| 46976 | SDValue Cond0 = Cond.getOperand(0); | |||
| 46977 | SDValue Cond1 = Cond.getOperand(1); | |||
| 46978 | ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); | |||
| 46979 | ||||
| 46980 | // Canonicalize min/max: | |||
| 46981 | // (x > 0) ? x : 0 -> (x >= 0) ? x : 0 | |||
| 46982 | // (x < -1) ? x : -1 -> (x <= -1) ? x : -1 | |||
| 46983 | // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates | |||
| 46984 | // the need for an extra compare against zero. e.g. | |||
| 46985 | // (a - b) > 0 ? (a - b) : 0 -> (a - b) >= 0 ? (a - b) : 0 | |||
| 46986 | // subl %esi, %edi | |||
| 46987 | // testl %edi, %edi | |||
| 46988 | // movl $0, %eax | |||
| 46989 | // cmovgl %edi, %eax | |||
| 46990 | // => | |||
| 46991 | // xorl %eax, %eax | |||
| 46992 | // subl %esi, %edi | |||
| 46993 | // cmovsl %eax, %edi | |||
| 46994 | // | |||
| 46995 | // We can also canonicalize | |||
| 46996 | // (x s> 1) ? x : 1 -> (x s>= 1) ? x : 1 -> (x s> 0) ? x : 1 | |||
| 46997 | // (x u> 1) ? x : 1 -> (x u>= 1) ? x : 1 -> (x != 0) ? x : 1 | |||
| 46998 | // This allows the use of a test instruction for the compare. | |||
| 46999 | if (LHS == Cond0 && RHS == Cond1) { | |||
| 47000 | if ((CC == ISD::SETGT && (isNullConstant(RHS) || isOneConstant(RHS))) || | |||
| 47001 | (CC == ISD::SETLT && isAllOnesConstant(RHS))) { | |||
| 47002 | ISD::CondCode NewCC = CC == ISD::SETGT ? ISD::SETGE : ISD::SETLE; | |||
| 47003 | Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC); | |||
| 47004 | return DAG.getSelect(DL, VT, Cond, LHS, RHS); | |||
| 47005 | } | |||
| 47006 | if (CC == ISD::SETUGT && isOneConstant(RHS)) { | |||
| 47007 | ISD::CondCode NewCC = ISD::SETUGE; | |||
| 47008 | Cond = DAG.getSetCC(SDLoc(Cond), CondVT, Cond0, Cond1, NewCC); | |||
| 47009 | return DAG.getSelect(DL, VT, Cond, LHS, RHS); | |||
| 47010 | } | |||
| 47011 | } | |||
| 47012 | ||||
| 47013 | // Similar to DAGCombine's select(or(CC0,CC1),X,Y) fold but for legal types. | |||
| 47014 | // fold eq + gt/lt nested selects into ge/le selects | |||
| 47015 | // select (cmpeq Cond0, Cond1), LHS, (select (cmpugt Cond0, Cond1), LHS, Y) | |||
| 47016 | // --> (select (cmpuge Cond0, Cond1), LHS, Y) | |||
| 47017 | // select (cmpslt Cond0, Cond1), LHS, (select (cmpeq Cond0, Cond1), LHS, Y) | |||
| 47018 | // --> (select (cmpsle Cond0, Cond1), LHS, Y) | |||
| 47019 | // .. etc .. | |||
| 47020 | if (RHS.getOpcode() == ISD::SELECT && RHS.getOperand(1) == LHS && | |||
| 47021 | RHS.getOperand(0).getOpcode() == ISD::SETCC) { | |||
| 47022 | SDValue InnerSetCC = RHS.getOperand(0); | |||
| 47023 | ISD::CondCode InnerCC = | |||
| 47024 | cast<CondCodeSDNode>(InnerSetCC.getOperand(2))->get(); | |||
| 47025 | if ((CC == ISD::SETEQ || InnerCC == ISD::SETEQ) && | |||
| 47026 | Cond0 == InnerSetCC.getOperand(0) && | |||
| 47027 | Cond1 == InnerSetCC.getOperand(1)) { | |||
| 47028 | ISD::CondCode NewCC; | |||
| 47029 | switch (CC == ISD::SETEQ ? InnerCC : CC) { | |||
| 47030 | case ISD::SETGT: NewCC = ISD::SETGE; break; | |||
| 47031 | case ISD::SETLT: NewCC = ISD::SETLE; break; | |||
| 47032 | case ISD::SETUGT: NewCC = ISD::SETUGE; break; | |||
| 47033 | case ISD::SETULT: NewCC = ISD::SETULE; break; | |||
| 47034 | default: NewCC = ISD::SETCC_INVALID; break; | |||
| 47035 | } | |||
| 47036 | if (NewCC != ISD::SETCC_INVALID) { | |||
| 47037 | Cond = DAG.getSetCC(DL, CondVT, Cond0, Cond1, NewCC); | |||
| 47038 | return DAG.getSelect(DL, VT, Cond, LHS, RHS.getOperand(2)); | |||
| 47039 | } | |||
| 47040 | } | |||
| 47041 | } | |||
| 47042 | } | |||
| 47043 | ||||
| 47044 | // Check if the first operand is all zeros and Cond type is vXi1. | |||
| 47045 | // If this an avx512 target we can improve the use of zero masking by | |||
| 47046 | // swapping the operands and inverting the condition. | |||
| 47047 | if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() && | |||
| 47048 | Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 && | |||
| 47049 | ISD::isBuildVectorAllZeros(LHS.getNode()) && | |||
| 47050 | !ISD::isBuildVectorAllZeros(RHS.getNode())) { | |||
| 47051 | // Invert the cond to not(cond) : xor(op,allones)=not(op) | |||
| 47052 | SDValue CondNew = DAG.getNOT(DL, Cond, CondVT); | |||
| 47053 | // Vselect cond, op1, op2 = Vselect not(cond), op2, op1 | |||
| 47054 | return DAG.getSelect(DL, VT, CondNew, RHS, LHS); | |||
| 47055 | } | |||
| 47056 | ||||
| 47057 | // Attempt to convert a (vXi1 bitcast(iX Cond)) selection mask before it might | |||
| 47058 | // get split by legalization. | |||
| 47059 | if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::BITCAST && | |||
| 47060 | CondVT.getVectorElementType() == MVT::i1 && | |||
| 47061 | TLI.isTypeLegal(VT.getScalarType())) { | |||
| 47062 | EVT ExtCondVT = VT.changeVectorElementTypeToInteger(); | |||
| 47063 | if (SDValue ExtCond = combineToExtendBoolVectorInReg( | |||
| 47064 | ISD::SIGN_EXTEND, DL, ExtCondVT, Cond, DAG, DCI, Subtarget)) { | |||
| 47065 | ExtCond = DAG.getNode(ISD::TRUNCATE, DL, CondVT, ExtCond); | |||
| 47066 | return DAG.getSelect(DL, VT, ExtCond, LHS, RHS); | |||
| 47067 | } | |||
| 47068 | } | |||
| 47069 | ||||
| 47070 | // Early exit check | |||
| 47071 | if (!TLI.isTypeLegal(VT) || isSoftFP16(VT, Subtarget)) | |||
| 47072 | return SDValue(); | |||
| 47073 | ||||
| 47074 | if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget)) | |||
| 47075 | return V; | |||
| 47076 | ||||
| 47077 | if (SDValue V = combineVSelectToBLENDV(N, DAG, DCI, Subtarget)) | |||
| 47078 | return V; | |||
| 47079 | ||||
| 47080 | if (SDValue V = narrowVectorSelect(N, DAG, Subtarget)) | |||
| 47081 | return V; | |||
| 47082 | ||||
| 47083 | // select(~Cond, X, Y) -> select(Cond, Y, X) | |||
| 47084 | if (CondVT.getScalarType() != MVT::i1) { | |||
| 47085 | if (SDValue CondNot = IsNOT(Cond, DAG)) | |||
| 47086 | return DAG.getNode(N->getOpcode(), DL, VT, | |||
| 47087 | DAG.getBitcast(CondVT, CondNot), RHS, LHS); | |||
| 47088 | ||||
| 47089 | // pcmpgt(X, -1) -> pcmpgt(0, X), so that select/blendv can just use the | |||
| 47090 | // signbit. | |||
| 47091 | if (Cond.getOpcode() == X86ISD::PCMPGT && | |||
| 47092 | ISD::isBuildVectorAllOnes(Cond.getOperand(1).getNode()) && | |||
| 47093 | Cond.hasOneUse()) { | |||
| 47094 | Cond = DAG.getNode(X86ISD::PCMPGT, DL, CondVT, | |||
| 47095 | DAG.getConstant(0, DL, CondVT), Cond.getOperand(0)); | |||
| 47096 | return DAG.getNode(N->getOpcode(), DL, VT, Cond, RHS, LHS); | |||
| 47097 | } | |||
| 47098 | } | |||
| 47099 | ||||
| 47100 | // Try to optimize vXi1 selects if both operands are either all constants or | |||
| 47101 | // bitcasts from scalar integer type. In that case we can convert the operands | |||
| 47102 | // to integer and use an integer select which will be converted to a CMOV. | |||
| 47103 | // We need to take a little bit of care to avoid creating an i64 type after | |||
| 47104 | // type legalization. | |||
| 47105 | if (N->getOpcode() == ISD::SELECT && VT.isVector() && | |||
| 47106 | VT.getVectorElementType() == MVT::i1 && | |||
| 47107 | (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) { | |||
| 47108 | EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements()); | |||
| 47109 | if (DCI.isBeforeLegalize() || TLI.isTypeLegal(IntVT)) { | |||
| 47110 | bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode()); | |||
| 47111 | bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode()); | |||
| 47112 | ||||
| 47113 | if ((LHSIsConst || (LHS.getOpcode() == ISD::BITCAST && | |||
| 47114 | LHS.getOperand(0).getValueType() == IntVT)) && | |||
| 47115 | (RHSIsConst || (RHS.getOpcode() == ISD::BITCAST && | |||
| 47116 | RHS.getOperand(0).getValueType() == IntVT))) { | |||
| 47117 | if (LHSIsConst) | |||
| 47118 | LHS = combinevXi1ConstantToInteger(LHS, DAG); | |||
| 47119 | else | |||
| 47120 | LHS = LHS.getOperand(0); | |||
| 47121 | ||||
| 47122 | if (RHSIsConst) | |||
| 47123 | RHS = combinevXi1ConstantToInteger(RHS, DAG); | |||
| 47124 | else | |||
| 47125 | RHS = RHS.getOperand(0); | |||
| 47126 | ||||
| 47127 | SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS); | |||
| 47128 | return DAG.getBitcast(VT, Select); | |||
| 47129 | } | |||
| 47130 | } | |||
| 47131 | } | |||
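// Sketch of the conversion (illustrative annotation): for VT == v8i1,
//   (select i1 C, (v8i1 bitcast (i8 X)), (v8i1 constant))
//     --> (v8i1 bitcast (select i1 C, i8 X, i8 imm))
// where the scalar integer select then lowers to a CMOV.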
| 47132 | ||||
| 47133 | // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of | |||
| 47134 | // single bits, then invert the predicate and swap the select operands. | |||
| 47135 | // This can lower using a vector shift bit-hack rather than mask and compare. | |||
| 47136 | if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() && | |||
| 47137 | N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && | |||
| 47138 | Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 && | |||
| 47139 | Cond.getOperand(0).getOpcode() == ISD::AND && | |||
| 47140 | isNullOrNullSplat(Cond.getOperand(1)) && | |||
| 47141 | cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ && | |||
| 47142 | Cond.getOperand(0).getValueType() == VT) { | |||
| 47143 | // The 'and' mask must be composed of power-of-2 constants. | |||
| 47144 | SDValue And = Cond.getOperand(0); | |||
| 47145 | auto *C = isConstOrConstSplat(And.getOperand(1)); | |||
| 47146 | if (C && C->getAPIntValue().isPowerOf2()) { | |||
| 47147 | // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS | |||
| 47148 | SDValue NotCond = | |||
| 47149 | DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE); | |||
| 47150 | return DAG.getSelect(DL, VT, NotCond, RHS, LHS); | |||
| 47151 | } | |||
| 47152 | ||||
| 47153 | // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld | |||
| 47154 | // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply. | |||
| 47155 | // 16-bit lacks a proper blendv. | |||
| 47156 | unsigned EltBitWidth = VT.getScalarSizeInBits(); | |||
| 47157 | bool CanShiftBlend = | |||
| 47158 | TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) || | |||
| 47159 | (Subtarget.hasAVX2() && EltBitWidth == 64) || | |||
| 47160 | (Subtarget.hasXOP())); | |||
| 47161 | if (CanShiftBlend && | |||
| 47162 | ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) { | |||
| 47163 | return C->getAPIntValue().isPowerOf2(); | |||
| 47164 | })) { | |||
| 47165 | // Create a left-shift constant to get the mask bits over to the sign-bit. | |||
| 47166 | SDValue Mask = And.getOperand(1); | |||
| 47167 | SmallVector<int, 32> ShlVals; | |||
| 47168 | for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) { | |||
| 47169 | auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i)); | |||
| 47170 | ShlVals.push_back(EltBitWidth - 1 - | |||
| 47171 | MaskVal->getAPIntValue().exactLogBase2()); | |||
| 47172 | } | |||
| 47173 | // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS | |||
| 47174 | SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL); | |||
| 47175 | SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt); | |||
| 47176 | SDValue NewCond = | |||
| 47177 | DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT); | |||
| 47178 | return DAG.getSelect(DL, VT, NewCond, RHS, LHS); | |||
| 47179 | } | |||
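// Worked example (annotation): for v4i32 with mask <4,4,8,8>, ShlVals is
// <31-2,31-2,31-3,31-3> = <29,29,28,28>, giving
//   vselect ((X & <4,4,8,8>) == 0), L, R
//     --> vselect ((shl X, <29,29,28,28>) < 0), R, L
// so each selected mask bit lands in the sign bit and can feed BLENDV.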
| 47180 | } | |||
| 47181 | ||||
| 47182 | return SDValue(); | |||
| 47183 | } | |||
| 47184 | ||||
| 47185 | /// Combine: | |||
| 47186 | /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S) | |||
| 47187 | /// to: | |||
| 47188 | /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE) | |||
| 47189 | /// i.e., reusing the EFLAGS produced by the LOCKed instruction. | |||
| 47190 | /// Note that this is only legal for some op/cc combinations. | |||
| 47191 | static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, | |||
| 47192 | SelectionDAG &DAG, | |||
| 47193 | const X86Subtarget &Subtarget) { | |||
| 47194 | // This combine only operates on CMP-like nodes. | |||
| 47195 | if (!(Cmp.getOpcode() == X86ISD::CMP || | |||
| 47196 | (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) | |||
| 47197 | return SDValue(); | |||
| 47198 | ||||
| 47199 | // Can't replace the cmp if it has more uses than the one we're looking at. | |||
| 47200 | // FIXME: We would like to be able to handle this, but would need to make sure | |||
| 47201 | // all uses were updated. | |||
| 47202 | if (!Cmp.hasOneUse()) | |||
| 47203 | return SDValue(); | |||
| 47204 | ||||
| 47205 | // This only applies to variations of the common case: | |||
| 47206 | // (icmp slt x, 0) -> (icmp sle (add x, 1), 0) | |||
| 47207 | // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0) | |||
| 47208 | // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0) | |||
| 47209 | // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0) | |||
| 47210 | // Using the proper condcodes (see below), overflow is checked for. | |||
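// Concrete sketch (annotation, not upstream source): for C code like
//   if (atomic_fetch_add(&x, 1) < 0) ...
// the compare of the returned old value against 0 becomes a test of the
// flags of the locked add itself, roughly "lock addl $1, (x); jle", where
// JLE's ZF|(SF^OF) check is what keeps the signed-overflow case correct.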
| 47211 | ||||
| 47212 | // FIXME: We can generalize both constraints: | |||
| 47213 | // - XOR/OR/AND (if they were made to survive AtomicExpand) | |||
| 47214 | // - LHS != 1 | |||
| 47215 | // if the result is compared. | |||
| 47216 | ||||
| 47217 | SDValue CmpLHS = Cmp.getOperand(0); | |||
| 47218 | SDValue CmpRHS = Cmp.getOperand(1); | |||
| 47219 | EVT CmpVT = CmpLHS.getValueType(); | |||
| 47220 | ||||
| 47221 | if (!CmpLHS.hasOneUse()) | |||
| 47222 | return SDValue(); | |||
| 47223 | ||||
| 47224 | unsigned Opc = CmpLHS.getOpcode(); | |||
| 47225 | if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB) | |||
| 47226 | return SDValue(); | |||
| 47227 | ||||
| 47228 | SDValue OpRHS = CmpLHS.getOperand(2); | |||
| 47229 | auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS); | |||
| 47230 | if (!OpRHSC) | |||
| 47231 | return SDValue(); | |||
| 47232 | ||||
| 47233 | APInt Addend = OpRHSC->getAPIntValue(); | |||
| 47234 | if (Opc == ISD::ATOMIC_LOAD_SUB) | |||
| 47235 | Addend = -Addend; | |||
| 47236 | ||||
| 47237 | auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS); | |||
| 47238 | if (!CmpRHSC) | |||
| 47239 | return SDValue(); | |||
| 47240 | ||||
| 47241 | APInt Comparison = CmpRHSC->getAPIntValue(); | |||
| 47242 | APInt NegAddend = -Addend; | |||
| 47243 | ||||
| 47244 | // See if we can adjust the CC to make the comparison match the negated | |||
| 47245 | // addend. | |||
| 47246 | if (Comparison != NegAddend) { | |||
| 47247 | APInt IncComparison = Comparison + 1; | |||
| 47248 | if (IncComparison == NegAddend) { | |||
| 47249 | if (CC == X86::COND_A && !Comparison.isMaxValue()) { | |||
| 47250 | Comparison = IncComparison; | |||
| 47251 | CC = X86::COND_AE; | |||
| 47252 | } else if (CC == X86::COND_LE && !Comparison.isMaxSignedValue()) { | |||
| 47253 | Comparison = IncComparison; | |||
| 47254 | CC = X86::COND_L; | |||
| 47255 | } | |||
| 47256 | } | |||
| 47257 | APInt DecComparison = Comparison - 1; | |||
| 47258 | if (DecComparison == NegAddend) { | |||
| 47259 | if (CC == X86::COND_AE && !Comparison.isMinValue()) { | |||
| 47260 | Comparison = DecComparison; | |||
| 47261 | CC = X86::COND_A; | |||
| 47262 | } else if (CC == X86::COND_L && !Comparison.isMinSignedValue()) { | |||
| 47263 | Comparison = DecComparison; | |||
| 47264 | CC = X86::COND_LE; | |||
| 47265 | } | |||
| 47266 | } | |||
| 47267 | } | |||
| 47268 | ||||
| 47269 | // If the addend is the negation of the comparison value, then we can do | |||
| 47270 | // a full comparison by emitting the atomic arithmetic as a locked sub. | |||
| 47271 | if (Comparison == NegAddend) { | |||
| 47272 | // The CC is fine, but we need to rewrite the LHS of the comparison as an | |||
| 47273 | // atomic sub. | |||
| 47274 | auto *AN = cast<AtomicSDNode>(CmpLHS.getNode()); | |||
| 47275 | auto AtomicSub = DAG.getAtomic( | |||
| 47276 | ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpVT, | |||
| 47277 | /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1), | |||
| 47278 | /*RHS*/ DAG.getConstant(NegAddend, SDLoc(CmpRHS), CmpVT), | |||
| 47279 | AN->getMemOperand()); | |||
| 47280 | auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget); | |||
| 47281 | DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT)); | |||
| 47282 | DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1)); | |||
| 47283 | return LockOp; | |||
| 47284 | } | |||
| 47285 | ||||
| 47286 | // We can handle comparisons with zero in a number of cases by manipulating | |||
| 47287 | // the CC used. | |||
| 47288 | if (!Comparison.isZero()) | |||
| 47289 | return SDValue(); | |||
| 47290 | ||||
| 47291 | if (CC == X86::COND_S && Addend == 1) | |||
| 47292 | CC = X86::COND_LE; | |||
| 47293 | else if (CC == X86::COND_NS && Addend == 1) | |||
| 47294 | CC = X86::COND_G; | |||
| 47295 | else if (CC == X86::COND_G && Addend == -1) | |||
| 47296 | CC = X86::COND_GE; | |||
| 47297 | else if (CC == X86::COND_LE && Addend == -1) | |||
| 47298 | CC = X86::COND_L; | |||
| 47299 | else | |||
| 47300 | return SDValue(); | |||
| 47301 | ||||
| 47302 | SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget); | |||
| 47303 | DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpVT)); | |||
| 47304 | DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1)); | |||
| 47305 | return LockOp; | |||
| 47306 | } | |||
| 47307 | ||||
| 47308 | // Check whether a boolean test is testing a boolean value generated by | |||
| 47309 | // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition | |||
| 47310 | // code. | |||
| 47311 | // | |||
| 47312 | // Simplify the following patterns: | |||
| 47313 | // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or | |||
| 47314 | // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ) | |||
| 47315 | // to (Op EFLAGS Cond) | |||
| 47316 | // | |||
| 47317 | // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or | |||
| 47318 | // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ) | |||
| 47319 | // to (Op EFLAGS !Cond) | |||
| 47320 | // | |||
| 47321 | // where Op could be BRCOND or CMOV. | |||
| 47322 | // | |||
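// Example (annotation): (brcond (CMP (SETCC COND_L, EFLAGS), 0), COND_E)
// simplifies to (brcond EFLAGS, COND_GE), since testing the setcc result
// for equality with 0 is just the negation of the original condition.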
| 47323 | static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) { | |||
| 47324 | // This combine only operates on CMP-like nodes. | |||
| 47325 | if (!(Cmp.getOpcode() == X86ISD::CMP || | |||
| 47326 | (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) | |||
| 47327 | return SDValue(); | |||
| 47328 | ||||
| 47329 | // Quit if not used as a boolean value. | |||
| 47330 | if (CC != X86::COND_E && CC != X86::COND_NE) | |||
| 47331 | return SDValue(); | |||
| 47332 | ||||
| 47333 | // Check CMP operands. One of them should be 0 or 1 and the other should be | |||
| 47334 | // a SetCC or a value extended from it. | |||
| 47335 | SDValue Op1 = Cmp.getOperand(0); | |||
| 47336 | SDValue Op2 = Cmp.getOperand(1); | |||
| 47337 | ||||
| 47338 | SDValue SetCC; | |||
| 47339 | const ConstantSDNode* C = nullptr; | |||
| 47340 | bool needOppositeCond = (CC == X86::COND_E); | |||
| 47341 | bool checkAgainstTrue = false; // Is it a comparison against 1? | |||
| 47342 | ||||
| 47343 | if ((C = dyn_cast<ConstantSDNode>(Op1))) | |||
| 47344 | SetCC = Op2; | |||
| 47345 | else if ((C = dyn_cast<ConstantSDNode>(Op2))) | |||
| 47346 | SetCC = Op1; | |||
| 47347 | else // Quit if neither operand is a constant. | |||
| 47348 | return SDValue(); | |||
| 47349 | ||||
| 47350 | if (C->getZExtValue() == 1) { | |||
| 47351 | needOppositeCond = !needOppositeCond; | |||
| 47352 | checkAgainstTrue = true; | |||
| 47353 | } else if (C->getZExtValue() != 0) | |||
| 47354 | // Quit if the constant is neither 0 nor 1. | |||
| 47355 | return SDValue(); | |||
| 47356 | ||||
| 47357 | bool truncatedToBoolWithAnd = false; | |||
| 47358 | // Skip (zext $x), (trunc $x), or (and $x, 1) node. | |||
| 47359 | while (SetCC.getOpcode() == ISD::ZERO_EXTEND || | |||
| 47360 | SetCC.getOpcode() == ISD::TRUNCATE || | |||
| 47361 | SetCC.getOpcode() == ISD::AND) { | |||
| 47362 | if (SetCC.getOpcode() == ISD::AND) { | |||
| 47363 | int OpIdx = -1; | |||
| 47364 | if (isOneConstant(SetCC.getOperand(0))) | |||
| 47365 | OpIdx = 1; | |||
| 47366 | if (isOneConstant(SetCC.getOperand(1))) | |||
| 47367 | OpIdx = 0; | |||
| 47368 | if (OpIdx < 0) | |||
| 47369 | break; | |||
| 47370 | SetCC = SetCC.getOperand(OpIdx); | |||
| 47371 | truncatedToBoolWithAnd = true; | |||
| 47372 | } else | |||
| 47373 | SetCC = SetCC.getOperand(0); | |||
| 47374 | } | |||
| 47375 | ||||
| 47376 | switch (SetCC.getOpcode()) { | |||
| 47377 | case X86ISD::SETCC_CARRY: | |||
| 47378 | // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to | |||
| 47379 | // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1, | |||
| 47380 | // i.e. it's a comparison against true but the result of SETCC_CARRY is not | |||
| 47381 | // truncated to i1 using 'and'. | |||
| 47382 | if (checkAgainstTrue && !truncatedToBoolWithAnd) | |||
| 47383 | break; | |||
| 47384 | assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B && | |||
| 47385 |        "Invalid use of SETCC_CARRY!"); | |||
| 47386 | [[fallthrough]]; | |||
| 47387 | case X86ISD::SETCC: | |||
| 47388 | // Set the condition code or opposite one if necessary. | |||
| 47389 | CC = X86::CondCode(SetCC.getConstantOperandVal(0)); | |||
| 47390 | if (needOppositeCond) | |||
| 47391 | CC = X86::GetOppositeBranchCondition(CC); | |||
| 47392 | return SetCC.getOperand(1); | |||
| 47393 | case X86ISD::CMOV: { | |||
| 47394 | // Check whether the false/true values are canonical, i.e. 0 or 1. | |||
| 47395 | ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0)); | |||
| 47396 | ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1)); | |||
| 47397 | // Quit if true value is not a constant. | |||
| 47398 | if (!TVal) | |||
| 47399 | return SDValue(); | |||
| 47400 | // Quit if false value is not a constant. | |||
| 47401 | if (!FVal) { | |||
| 47402 | SDValue Op = SetCC.getOperand(0); | |||
| 47403 | // Skip 'zext' or 'trunc' node. | |||
| 47404 | if (Op.getOpcode() == ISD::ZERO_EXTEND || | |||
| 47405 | Op.getOpcode() == ISD::TRUNCATE) | |||
| 47406 | Op = Op.getOperand(0); | |||
| 47407 | // A special case for rdrand/rdseed, where the value is 0 when the false | |||
| 47408 | // condition is found. | |||
| 47409 | if ((Op.getOpcode() != X86ISD::RDRAND && | |||
| 47410 | Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0) | |||
| 47411 | return SDValue(); | |||
| 47412 | } | |||
| 47413 | // Quit if false value is not the constant 0 or 1. | |||
| 47414 | bool FValIsFalse = true; | |||
| 47415 | if (FVal && FVal->getZExtValue() != 0) { | |||
| 47416 | if (FVal->getZExtValue() != 1) | |||
| 47417 | return SDValue(); | |||
| 47418 | // If FVal is 1, opposite cond is needed. | |||
| 47419 | needOppositeCond = !needOppositeCond; | |||
| 47420 | FValIsFalse = false; | |||
| 47421 | } | |||
| 47422 | // Quit if TVal is not the constant opposite of FVal. | |||
| 47423 | if (FValIsFalse && TVal->getZExtValue() != 1) | |||
| 47424 | return SDValue(); | |||
| 47425 | if (!FValIsFalse && TVal->getZExtValue() != 0) | |||
| 47426 | return SDValue(); | |||
| 47427 | CC = X86::CondCode(SetCC.getConstantOperandVal(2)); | |||
| 47428 | if (needOppositeCond) | |||
| 47429 | CC = X86::GetOppositeBranchCondition(CC); | |||
| 47430 | return SetCC.getOperand(3); | |||
| 47431 | } | |||
| 47432 | } | |||
| 47433 | ||||
| 47434 | return SDValue(); | |||
| 47435 | } | |||
| 47436 | ||||
| 47437 | /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS. | |||
| 47438 | /// Match: | |||
| 47439 | /// (X86or (X86setcc) (X86setcc)) | |||
| 47440 | /// (X86cmp (and (X86setcc) (X86setcc)), 0) | |||
| 47441 | static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0, | |||
| 47442 | X86::CondCode &CC1, SDValue &Flags, | |||
| 47443 | bool &isAnd) { | |||
| 47444 | if (Cond->getOpcode() == X86ISD::CMP) { | |||
| 47445 | if (!isNullConstant(Cond->getOperand(1))) | |||
| 47446 | return false; | |||
| 47447 | ||||
| 47448 | Cond = Cond->getOperand(0); | |||
| 47449 | } | |||
| 47450 | ||||
| 47451 | isAnd = false; | |||
| 47452 | ||||
| 47453 | SDValue SetCC0, SetCC1; | |||
| 47454 | switch (Cond->getOpcode()) { | |||
| 47455 | default: return false; | |||
| 47456 | case ISD::AND: | |||
| 47457 | case X86ISD::AND: | |||
| 47458 | isAnd = true; | |||
| 47459 | [[fallthrough]]; | |||
| 47460 | case ISD::OR: | |||
| 47461 | case X86ISD::OR: | |||
| 47462 | SetCC0 = Cond->getOperand(0); | |||
| 47463 | SetCC1 = Cond->getOperand(1); | |||
| 47464 | break; | |||
| 47465 | } | |||
| 47466 | ||||
| 47467 | // Make sure we have SETCC nodes, using the same flags value. | |||
| 47468 | if (SetCC0.getOpcode() != X86ISD::SETCC || | |||
| 47469 | SetCC1.getOpcode() != X86ISD::SETCC || | |||
| 47470 | SetCC0->getOperand(1) != SetCC1->getOperand(1)) | |||
| 47471 | return false; | |||
| 47472 | ||||
| 47473 | CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0); | |||
| 47474 | CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0); | |||
| 47475 | Flags = SetCC0->getOperand(1); | |||
| 47476 | return true; | |||
| 47477 | } | |||
| 47478 | ||||
| 47479 | // When legalizing carry, we create carries via add X, -1. | |||
| 47480 | // If that comes from an actual carry (via setcc), we use the | |||
| 47481 | // carry directly. | |||
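// Example (annotation): EFLAGS = (X86add (zext (setcc COND_B, F)), -1).
// For a 0/1 value V, "add V, -1" produces carry-out CF == V, so a COND_B
// user of this add can consume F's carry directly.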
| 47482 | static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) { | |||
| 47483 | if (EFLAGS.getOpcode() == X86ISD::ADD) { | |||
| 47484 | if (isAllOnesConstant(EFLAGS.getOperand(1))) { | |||
| 47485 | bool FoundAndLSB = false; | |||
| 47486 | SDValue Carry = EFLAGS.getOperand(0); | |||
| 47487 | while (Carry.getOpcode() == ISD::TRUNCATE || | |||
| 47488 | Carry.getOpcode() == ISD::ZERO_EXTEND || | |||
| 47489 | (Carry.getOpcode() == ISD::AND && | |||
| 47490 | isOneConstant(Carry.getOperand(1)))) { | |||
| 47491 | FoundAndLSB |= Carry.getOpcode() == ISD::AND; | |||
| 47492 | Carry = Carry.getOperand(0); | |||
| 47493 | } | |||
| 47494 | if (Carry.getOpcode() == X86ISD::SETCC || | |||
| 47495 | Carry.getOpcode() == X86ISD::SETCC_CARRY) { | |||
| 47496 | // TODO: Merge this code with equivalent in combineAddOrSubToADCOrSBB? | |||
| 47497 | uint64_t CarryCC = Carry.getConstantOperandVal(0); | |||
| 47498 | SDValue CarryOp1 = Carry.getOperand(1); | |||
| 47499 | if (CarryCC == X86::COND_B) | |||
| 47500 | return CarryOp1; | |||
| 47501 | if (CarryCC == X86::COND_A) { | |||
| 47502 | // Try to convert COND_A into COND_B in an attempt to facilitate | |||
| 47503 | // materializing "setb reg". | |||
| 47504 | // | |||
| 47505 | // Do not flip "e > c", where "c" is a constant, because Cmp | |||
| 47506 | // instruction cannot take an immediate as its first operand. | |||
| 47507 | // | |||
| 47508 | if (CarryOp1.getOpcode() == X86ISD::SUB && | |||
| 47509 | CarryOp1.getNode()->hasOneUse() && | |||
| 47510 | CarryOp1.getValueType().isInteger() && | |||
| 47511 | !isa<ConstantSDNode>(CarryOp1.getOperand(1))) { | |||
| 47512 | SDValue SubCommute = | |||
| 47513 | DAG.getNode(X86ISD::SUB, SDLoc(CarryOp1), CarryOp1->getVTList(), | |||
| 47514 | CarryOp1.getOperand(1), CarryOp1.getOperand(0)); | |||
| 47515 | return SDValue(SubCommute.getNode(), CarryOp1.getResNo()); | |||
| 47516 | } | |||
| 47517 | } | |||
| 47518 | // If this is a check of the z flag of an add with 1, switch to the | |||
| 47519 | // C flag. | |||
| 47520 | if (CarryCC == X86::COND_E && | |||
| 47521 | CarryOp1.getOpcode() == X86ISD::ADD && | |||
| 47522 | isOneConstant(CarryOp1.getOperand(1))) | |||
| 47523 | return CarryOp1; | |||
| 47524 | } else if (FoundAndLSB) { | |||
| 47525 | SDLoc DL(Carry); | |||
| 47526 | SDValue BitNo = DAG.getConstant(0, DL, Carry.getValueType()); | |||
| 47527 | if (Carry.getOpcode() == ISD::SRL) { | |||
| 47528 | BitNo = Carry.getOperand(1); | |||
| 47529 | Carry = Carry.getOperand(0); | |||
| 47530 | } | |||
| 47531 | return getBT(Carry, BitNo, DL, DAG); | |||
| 47532 | } | |||
| 47533 | } | |||
| 47534 | } | |||
| 47535 | ||||
| 47536 | return SDValue(); | |||
| 47537 | } | |||
| 47538 | ||||
| 47539 | /// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC | |||
| 47540 | /// to avoid the inversion. | |||
| 47541 | static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC, | |||
| 47542 | SelectionDAG &DAG, | |||
| 47543 | const X86Subtarget &Subtarget) { | |||
| 47544 | // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST. | |||
| 47545 | if (EFLAGS.getOpcode() != X86ISD::PTEST && | |||
| 47546 | EFLAGS.getOpcode() != X86ISD::TESTP) | |||
| 47547 | return SDValue(); | |||
| 47548 | ||||
| 47549 | // PTEST/TESTP sets EFLAGS as: | |||
| 47550 | // TESTZ: ZF = (Op0 & Op1) == 0 | |||
| 47551 | // TESTC: CF = (~Op0 & Op1) == 0 | |||
| 47552 | // TESTNZC: ZF == 0 && CF == 0 | |||
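// Example of the inversion below (annotation): PTEST(~X, Y) with COND_B
// (testc: CF = (~~X & Y) == 0 = (X & Y) == 0) is the same predicate as
// PTEST(X, Y) with COND_E (testz: ZF = (X & Y) == 0), so the NOT can be
// dropped by switching the condition code.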
| 47553 | MVT VT = EFLAGS.getSimpleValueType(); | |||
| 47554 | SDValue Op0 = EFLAGS.getOperand(0); | |||
| 47555 | SDValue Op1 = EFLAGS.getOperand(1); | |||
| 47556 | MVT OpVT = Op0.getSimpleValueType(); | |||
| 47557 | ||||
| 47558 | // TEST*(~X,Y) == TEST*(X,Y) | |||
| 47559 | if (SDValue NotOp0 = IsNOT(Op0, DAG)) { | |||
| 47560 | X86::CondCode InvCC; | |||
| 47561 | switch (CC) { | |||
| 47562 | case X86::COND_B: | |||
| 47563 | // testc -> testz. | |||
| 47564 | InvCC = X86::COND_E; | |||
| 47565 | break; | |||
| 47566 | case X86::COND_AE: | |||
| 47567 | // !testc -> !testz. | |||
| 47568 | InvCC = X86::COND_NE; | |||
| 47569 | break; | |||
| 47570 | case X86::COND_E: | |||
| 47571 | // testz -> testc. | |||
| 47572 | InvCC = X86::COND_B; | |||
| 47573 | break; | |||
| 47574 | case X86::COND_NE: | |||
| 47575 | // !testz -> !testc. | |||
| 47576 | InvCC = X86::COND_AE; | |||
| 47577 | break; | |||
| 47578 | case X86::COND_A: | |||
| 47579 | case X86::COND_BE: | |||
| 47580 | // testnzc -> testnzc (no change). | |||
| 47581 | InvCC = CC; | |||
| 47582 | break; | |||
| 47583 | default: | |||
| 47584 | InvCC = X86::COND_INVALID; | |||
| 47585 | break; | |||
| 47586 | } | |||
| 47587 | ||||
| 47588 | if (InvCC != X86::COND_INVALID) { | |||
| 47589 | CC = InvCC; | |||
| 47590 | return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, | |||
| 47591 | DAG.getBitcast(OpVT, NotOp0), Op1); | |||
| 47592 | } | |||
| 47593 | } | |||
| 47594 | ||||
| 47595 | if (CC == X86::COND_B || CC == X86::COND_AE) { | |||
| 47596 | // TESTC(X,~X) == TESTC(X,-1) | |||
| 47597 | if (SDValue NotOp1 = IsNOT(Op1, DAG)) { | |||
| 47598 | if (peekThroughBitcasts(NotOp1) == peekThroughBitcasts(Op0)) { | |||
| 47599 | SDLoc DL(EFLAGS); | |||
| 47600 | return DAG.getNode( | |||
| 47601 | EFLAGS.getOpcode(), DL, VT, DAG.getBitcast(OpVT, NotOp1), | |||
| 47602 | DAG.getBitcast(OpVT, | |||
| 47603 | DAG.getAllOnesConstant(DL, NotOp1.getValueType()))); | |||
| 47604 | } | |||
| 47605 | } | |||
| 47606 | } | |||
| 47607 | ||||
| 47608 | if (CC == X86::COND_E || CC == X86::COND_NE) { | |||
| 47609 | // TESTZ(X,~Y) == TESTC(Y,X) | |||
| 47610 | if (SDValue NotOp1 = IsNOT(Op1, DAG)) { | |||
| 47611 | CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE); | |||
| 47612 | return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, | |||
| 47613 | DAG.getBitcast(OpVT, NotOp1), Op0); | |||
| 47614 | } | |||
| 47615 | ||||
| 47616 | if (Op0 == Op1) { | |||
| 47617 | SDValue BC = peekThroughBitcasts(Op0); | |||
| 47618 | EVT BCVT = BC.getValueType(); | |||
| 47619 | ||||
| 47620 | // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y) | |||
| 47621 | if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) { | |||
| 47622 | return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, | |||
| 47623 | DAG.getBitcast(OpVT, BC.getOperand(0)), | |||
| 47624 | DAG.getBitcast(OpVT, BC.getOperand(1))); | |||
| 47625 | } | |||
| 47626 | ||||
| 47627 | // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y) | |||
| 47628 | if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) { | |||
| 47629 | CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE); | |||
| 47630 | return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, | |||
| 47631 | DAG.getBitcast(OpVT, BC.getOperand(0)), | |||
| 47632 | DAG.getBitcast(OpVT, BC.getOperand(1))); | |||
| 47633 | } | |||
| 47634 | ||||
| 47635 | // If every element is an all-sign value, see if we can use TESTP/MOVMSK | |||
| 47636 | // to more efficiently extract the sign bits and compare that. | |||
| 47637 | // TODO: Handle TESTC with comparison inversion. | |||
| 47638 | // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on | |||
| 47639 | // TESTP/MOVMSK combines to make sure its never worse than PTEST? | |||
| 47640 | if (BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) { | |||
| 47641 | unsigned EltBits = BCVT.getScalarSizeInBits(); | |||
| 47642 | if (DAG.ComputeNumSignBits(BC) == EltBits) { | |||
| 47643 | assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result"); | |||
| 47644 | APInt SignMask = APInt::getSignMask(EltBits); | |||
| 47645 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 47646 | if (SDValue Res = | |||
| 47647 | TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) { | |||
| 47648 | // For vXi16 cases we need to use pmovmskb and extract every other | |||
| 47649 | // sign bit. | |||
| 47650 | SDLoc DL(EFLAGS); | |||
| 47651 | if ((EltBits == 32 || EltBits == 64) && Subtarget.hasAVX()) { | |||
| 47652 | MVT FloatSVT = MVT::getFloatingPointVT(EltBits); | |||
| 47653 | MVT FloatVT = | |||
| 47654 | MVT::getVectorVT(FloatSVT, OpVT.getSizeInBits() / EltBits); | |||
| 47655 | Res = DAG.getBitcast(FloatVT, Res); | |||
| 47656 | return DAG.getNode(X86ISD::TESTP, SDLoc(EFLAGS), VT, Res, Res); | |||
| 47657 | } else if (EltBits == 16) { | |||
| 47658 | MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8; | |||
| 47659 | Res = DAG.getBitcast(MovmskVT, Res); | |||
| 47660 | Res = getPMOVMSKB(DL, Res, DAG, Subtarget); | |||
| 47661 | Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res, | |||
| 47662 | DAG.getConstant(0xAAAAAAAA, DL, MVT::i32)); | |||
| 47663 | } else { | |||
| 47664 | Res = getPMOVMSKB(DL, Res, DAG, Subtarget); | |||
| 47665 | } | |||
| 47666 | return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res, | |||
| 47667 | DAG.getConstant(0, DL, MVT::i32)); | |||
| 47668 | } | |||
| 47669 | } | |||
| 47670 | } | |||
| 47671 | } | |||
| 47672 | ||||
| 47673 | // TESTZ(-1,X) == TESTZ(X,X) | |||
| 47674 | if (ISD::isBuildVectorAllOnes(Op0.getNode())) | |||
| 47675 | return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1); | |||
| 47676 | ||||
| 47677 | // TESTZ(X,-1) == TESTZ(X,X) | |||
| 47678 | if (ISD::isBuildVectorAllOnes(Op1.getNode())) | |||
| 47679 | return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0); | |||
| 47680 | ||||
| 47681 | // TESTZ(OR(LO(X),HI(X)),OR(LO(Y),HI(Y))) -> TESTZ(X,Y) | |||
| 47682 | // TODO: Add COND_NE handling? | |||
| 47683 | if (CC == X86::COND_E && OpVT.is128BitVector() && Subtarget.hasAVX()) { | |||
| 47684 | SDValue Src0 = peekThroughBitcasts(Op0); | |||
| 47685 | SDValue Src1 = peekThroughBitcasts(Op1); | |||
| 47686 | if (Src0.getOpcode() == ISD::OR && Src1.getOpcode() == ISD::OR) { | |||
| 47687 | Src0 = getSplitVectorSrc(peekThroughBitcasts(Src0.getOperand(0)), | |||
| 47688 | peekThroughBitcasts(Src0.getOperand(1)), true); | |||
| 47689 | Src1 = getSplitVectorSrc(peekThroughBitcasts(Src1.getOperand(0)), | |||
| 47690 | peekThroughBitcasts(Src1.getOperand(1)), true); | |||
| 47691 | if (Src0 && Src1) { | |||
| 47692 | MVT OpVT2 = OpVT.getDoubleNumVectorElementsVT(); | |||
| 47693 | return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, | |||
| 47694 | DAG.getBitcast(OpVT2, Src0), | |||
| 47695 | DAG.getBitcast(OpVT2, Src1)); | |||
| 47696 | } | |||
| 47697 | } | |||
| 47698 | } | |||
| 47699 | } | |||
| 47700 | ||||
| 47701 | return SDValue(); | |||
| 47702 | } | |||
| 47703 | ||||
| 47704 | // Attempt to simplify the MOVMSK input based on the comparison type. | |||
| 47705 | static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC, | |||
| 47706 | SelectionDAG &DAG, | |||
| 47707 | const X86Subtarget &Subtarget) { | |||
| 47708 | // Handle eq/ne against zero (any_of). | |||
| 47709 | // Handle eq/ne against -1 (all_of). | |||
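// Example (annotation): (CMP (MOVMSK (v4f32 V)), 0) with COND_NE asks
// "is any lane's sign bit set", while (CMP (MOVMSK V), 15) with COND_E
// asks "are all 4 sign bits set".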
| 47710 | if (!(CC == X86::COND_E || CC == X86::COND_NE)) | |||
| 47711 | return SDValue(); | |||
| 47712 | if (EFLAGS.getValueType() != MVT::i32) | |||
| 47713 | return SDValue(); | |||
| 47714 | unsigned CmpOpcode = EFLAGS.getOpcode(); | |||
| 47715 | if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB) | |||
| 47716 | return SDValue(); | |||
| 47717 | auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1)); | |||
| 47718 | if (!CmpConstant) | |||
| 47719 | return SDValue(); | |||
| 47720 | const APInt &CmpVal = CmpConstant->getAPIntValue(); | |||
| 47721 | ||||
| 47722 | SDValue CmpOp = EFLAGS.getOperand(0); | |||
| 47723 | unsigned CmpBits = CmpOp.getValueSizeInBits(); | |||
| 47724 | assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch"); | |||
| 47725 | ||||
| 47726 | // Peek through any truncate. | |||
| 47727 | if (CmpOp.getOpcode() == ISD::TRUNCATE) | |||
| 47728 | CmpOp = CmpOp.getOperand(0); | |||
| 47729 | ||||
| 47730 | // Bail if we don't find a MOVMSK. | |||
| 47731 | if (CmpOp.getOpcode() != X86ISD::MOVMSK) | |||
| 47732 | return SDValue(); | |||
| 47733 | ||||
| 47734 | SDValue Vec = CmpOp.getOperand(0); | |||
| 47735 | MVT VecVT = Vec.getSimpleValueType(); | |||
| 47736 | assert((VecVT.is128BitVector() || VecVT.is256BitVector()) && | |||
| 47737 |        "Unexpected MOVMSK operand"); | |||
| 47738 | unsigned NumElts = VecVT.getVectorNumElements(); | |||
| 47739 | unsigned NumEltBits = VecVT.getScalarSizeInBits(); | |||
| 47740 | ||||
| 47741 | bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isZero(); | |||
| 47742 | bool IsAllOf = (CmpOpcode == X86ISD::SUB || CmpOpcode == X86ISD::CMP) && | |||
| 47743 | NumElts <= CmpBits && CmpVal.isMask(NumElts); | |||
| 47744 | if (!IsAnyOf && !IsAllOf) | |||
| 47745 | return SDValue(); | |||
| 47746 | ||||
| 47747 | // TODO: Check for more combining cases. | |||
| 47748 | // Here we check the number of uses of the CMP to decide whether to combine. | |||
| 47749 | // Currently only the "MOVMSK(CONCAT(..))" and "MOVMSK(PCMPEQ(..))" combines | |||
| 47750 | // (the two cases we have tests for) are gated by this one-use constraint. | |||
| 47751 | bool IsOneUse = CmpOp.getNode()->hasOneUse(); | |||
| 47752 | ||||
| 47753 | // See if we can peek through to a vector with a wider element type, if the | |||
| 47754 | // signbits extend down to all the sub-elements as well. | |||
| 47755 | // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose | |||
| 47756 | // potential SimplifyDemandedBits/Elts cases. | |||
| 47757 | // If we looked through a truncate that discarded bits, we can't do this | |||
| 47758 | // transform. | |||
| 47759 | // FIXME: We could do this transform for truncates that discarded bits by | |||
| 47760 | // inserting an AND mask between the new MOVMSK and the CMP. | |||
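// Example (annotation): MOVMSK(v16i8 bitcast (v4i32 X)) compared with 0,
// where each i32 lane of X has at least 25 sign bits, becomes
// MOVMSK(v4i32 X) == 0: each wide sign bit already covers the sign bits
// of all four of its i8 sub-lanes.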
| 47761 | if (Vec.getOpcode() == ISD::BITCAST && NumElts <= CmpBits) { | |||
| 47762 | SDValue BC = peekThroughBitcasts(Vec); | |||
| 47763 | MVT BCVT = BC.getSimpleValueType(); | |||
| 47764 | unsigned BCNumElts = BCVT.getVectorNumElements(); | |||
| 47765 | unsigned BCNumEltBits = BCVT.getScalarSizeInBits(); | |||
| 47766 | if ((BCNumEltBits == 32 || BCNumEltBits == 64) && | |||
| 47767 | BCNumEltBits > NumEltBits && | |||
| 47768 | DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) { | |||
| 47769 | SDLoc DL(EFLAGS); | |||
| 47770 | APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : BCNumElts); | |||
| 47771 | return DAG.getNode(X86ISD::CMP, DL, MVT::i32, | |||
| 47772 | DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC), | |||
| 47773 | DAG.getConstant(CmpMask, DL, MVT::i32)); | |||
| 47774 | } | |||
| 47775 | } | |||
| 47776 | ||||
| 47777 | // MOVMSK(CONCAT(X,Y)) == 0 -> MOVMSK(OR(X,Y)). | |||
| 47778 | // MOVMSK(CONCAT(X,Y)) != 0 -> MOVMSK(OR(X,Y)). | |||
| 47779 | // MOVMSK(CONCAT(X,Y)) == -1 -> MOVMSK(AND(X,Y)). | |||
| 47780 | // MOVMSK(CONCAT(X,Y)) != -1 -> MOVMSK(AND(X,Y)). | |||
| 47781 | if (VecVT.is256BitVector() && NumElts <= CmpBits && IsOneUse) { | |||
| 47782 | SmallVector<SDValue> Ops; | |||
| 47783 | if (collectConcatOps(peekThroughBitcasts(Vec).getNode(), Ops, DAG) && | |||
| 47784 | Ops.size() == 2) { | |||
| 47785 | SDLoc DL(EFLAGS); | |||
| 47786 | EVT SubVT = Ops[0].getValueType().changeTypeToInteger(); | |||
| 47787 | APInt CmpMask = APInt::getLowBitsSet(32, IsAnyOf ? 0 : NumElts / 2); | |||
| 47788 | SDValue V = DAG.getNode(IsAnyOf ? ISD::OR : ISD::AND, DL, SubVT, | |||
| 47789 | DAG.getBitcast(SubVT, Ops[0]), | |||
| 47790 | DAG.getBitcast(SubVT, Ops[1])); | |||
| 47791 | V = DAG.getBitcast(VecVT.getHalfNumVectorElementsVT(), V); | |||
| 47792 | return DAG.getNode(X86ISD::CMP, DL, MVT::i32, | |||
| 47793 | DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V), | |||
| 47794 | DAG.getConstant(CmpMask, DL, MVT::i32)); | |||
| 47795 | } | |||
| 47796 | } | |||
| 47797 | ||||
| 47798 | // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X). | |||
| 47799 | // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X). | |||
| 47800 | // MOVMSK(PCMPEQ(X,Y)) == -1 -> PTESTZ(XOR(X,Y),XOR(X,Y)). | |||
| 47801 | // MOVMSK(PCMPEQ(X,Y)) != -1 -> !PTESTZ(XOR(X,Y),XOR(X,Y)). | |||
| 47802 | if (IsAllOf && Subtarget.hasSSE41() && IsOneUse) { | |||
| 47803 | MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64; | |||
| 47804 | SDValue BC = peekThroughBitcasts(Vec); | |||
| 47805 | // Ensure MOVMSK was testing every signbit of BC. | |||
| 47806 | if (BC.getValueType().getVectorNumElements() <= NumElts) { | |||
| 47807 | if (BC.getOpcode() == X86ISD::PCMPEQ) { | |||
| 47808 | SDValue V = DAG.getNode(ISD::XOR, SDLoc(BC), BC.getValueType(), | |||
| 47809 | BC.getOperand(0), BC.getOperand(1)); | |||
| 47810 | V = DAG.getBitcast(TestVT, V); | |||
| 47811 | return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V); | |||
| 47812 | } | |||
| 47813 | // Check for 256-bit split vector cases. | |||
| 47814 | if (BC.getOpcode() == ISD::AND && | |||
| 47815 | BC.getOperand(0).getOpcode() == X86ISD::PCMPEQ && | |||
| 47816 | BC.getOperand(1).getOpcode() == X86ISD::PCMPEQ) { | |||
| 47817 | SDValue LHS = BC.getOperand(0); | |||
| 47818 | SDValue RHS = BC.getOperand(1); | |||
| 47819 | LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), LHS.getValueType(), | |||
| 47820 | LHS.getOperand(0), LHS.getOperand(1)); | |||
| 47821 | RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), RHS.getValueType(), | |||
| 47822 | RHS.getOperand(0), RHS.getOperand(1)); | |||
| 47823 | LHS = DAG.getBitcast(TestVT, LHS); | |||
| 47824 | RHS = DAG.getBitcast(TestVT, RHS); | |||
| 47825 | SDValue V = DAG.getNode(ISD::OR, SDLoc(EFLAGS), TestVT, LHS, RHS); | |||
| 47826 | return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V); | |||
| 47827 | } | |||
| 47828 | } | |||
| 47829 | } | |||
| 47830 | ||||
| 47831 | // See if we can avoid a PACKSS by calling MOVMSK on the sources. | |||
| 47832 | // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask out | |||
| 47833 | // sign bits prior to the comparison with zero unless we know that | |||
| 47834 | // the vXi16 splats the sign bit down to the lower i8 half. | |||
| 47835 | // TODO: Handle all_of patterns. | |||
| 47836 | if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) { | |||
| 47837 | SDValue VecOp0 = Vec.getOperand(0); | |||
| 47838 | SDValue VecOp1 = Vec.getOperand(1); | |||
| 47839 | bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8; | |||
| 47840 | bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8; | |||
| 47841 | // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA. | |||
| 47842 | if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) { | |||
| 47843 | SDLoc DL(EFLAGS); | |||
| 47844 | SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0); | |||
| 47845 | Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result); | |||
| 47846 | Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16); | |||
| 47847 | if (!SignExt0) { | |||
| 47848 | Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result, | |||
| 47849 | DAG.getConstant(0xAAAA, DL, MVT::i16)); | |||
| 47850 | } | |||
| 47851 | return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, | |||
| 47852 | DAG.getConstant(0, DL, MVT::i16)); | |||
| 47853 | } | |||
| 47854 | // PMOVMSKB(PACKSSBW(LO(X), HI(X))) | |||
| 47855 | // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA. | |||
| 47856 | if (CmpBits >= 16 && Subtarget.hasInt256() && | |||
| 47857 | (IsAnyOf || (SignExt0 && SignExt1))) { | |||
| 47858 | if (SDValue Src = getSplitVectorSrc(VecOp0, VecOp1, true)) { | |||
| 47859 | SDLoc DL(EFLAGS); | |||
| 47860 | SDValue Result = peekThroughBitcasts(Src); | |||
| 47861 | if (IsAllOf && Result.getOpcode() == X86ISD::PCMPEQ && | |||
| 47862 | Result.getValueType().getVectorNumElements() <= NumElts) { | |||
| 47863 | SDValue V = DAG.getNode(ISD::XOR, DL, Result.getValueType(), | |||
| 47864 | Result.getOperand(0), Result.getOperand(1)); | |||
| 47865 | V = DAG.getBitcast(MVT::v4i64, V); | |||
| 47866 | return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V); | |||
| 47867 | } | |||
| 47868 | Result = DAG.getBitcast(MVT::v32i8, Result); | |||
| 47869 | Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result); | |||
| 47870 | unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF; | |||
| 47871 | if (!SignExt0 || !SignExt1) { | |||
| 47872 | assert(IsAnyOf && | |||
| 47873 |        "Only perform v16i16 signmasks for any_of patterns"); | |||
| 47874 | Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result, | |||
| 47875 | DAG.getConstant(0xAAAAAAAA, DL, MVT::i32)); | |||
| 47876 | } | |||
| 47877 | return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, | |||
| 47878 | DAG.getConstant(CmpMask, DL, MVT::i32)); | |||
| 47879 | } | |||
| 47880 | } | |||
| 47881 | } | |||
| 47882 | ||||
| 47883 | // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced. | |||
| 47884 | SmallVector<int, 32> ShuffleMask; | |||
| 47885 | SmallVector<SDValue, 2> ShuffleInputs; | |||
| 47886 | if (NumElts <= CmpBits && | |||
| 47887 | getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs, | |||
| 47888 | ShuffleMask, DAG) && | |||
| 47889 | ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) && | |||
| 47890 | ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) { | |||
| 47891 | unsigned NumShuffleElts = ShuffleMask.size(); | |||
| 47892 | APInt DemandedElts = APInt::getZero(NumShuffleElts); | |||
| 47893 | for (int M : ShuffleMask) { | |||
| 47894 | assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index"); | |||
| 47895 | DemandedElts.setBit(M); | |||
| 47896 | } | |||
| 47897 | if (DemandedElts.isAllOnes()) { | |||
| 47898 | SDLoc DL(EFLAGS); | |||
| 47899 | SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]); | |||
| 47900 | Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result); | |||
| 47901 | Result = | |||
| 47902 | DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType()); | |||
| 47903 | return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result, | |||
| 47904 | EFLAGS.getOperand(1)); | |||
| 47905 | } | |||
| 47906 | } | |||
| 47907 | ||||
| 47908 | // MOVMSKPS(V) !=/== 0 -> TESTPS(V,V) | |||
| 47909 | // MOVMSKPD(V) !=/== 0 -> TESTPD(V,V) | |||
| 47910 | // MOVMSKPS(V) !=/== -1 -> TESTPS(V,V) | |||
| 47911 | // MOVMSKPD(V) !=/== -1 -> TESTPD(V,V) | |||
| 47912 | // iff every element is referenced. | |||
| 47913 | if (NumElts <= CmpBits && Subtarget.hasAVX() && IsOneUse && | |||
| 47914 | (NumEltBits == 32 || NumEltBits == 64)) { | |||
| 47915 | SDLoc DL(EFLAGS); | |||
| 47916 | MVT FloatSVT = MVT::getFloatingPointVT(NumEltBits); | |||
| 47917 | MVT FloatVT = MVT::getVectorVT(FloatSVT, NumElts); | |||
| 47918 | MVT IntVT = FloatVT.changeVectorElementTypeToInteger(); | |||
| 47919 | SDValue LHS = Vec; | |||
| 47920 | SDValue RHS = IsAnyOf ? Vec : DAG.getAllOnesConstant(DL, IntVT); | |||
| 47921 | CC = IsAnyOf ? CC : (CC == X86::COND_E ? X86::COND_B : X86::COND_AE); | |||
| 47922 | return DAG.getNode(X86ISD::TESTP, DL, MVT::i32, | |||
| 47923 | DAG.getBitcast(FloatVT, LHS), | |||
| 47924 | DAG.getBitcast(FloatVT, RHS)); | |||
| 47925 | } | |||
| 47926 | ||||
| 47927 | return SDValue(); | |||
| 47928 | } | |||
| 47929 | ||||
| 47930 | /// Optimize an EFLAGS definition used according to the condition code \p CC | |||
| 47931 | /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing | |||
| 47932 | /// uses of chain values. | |||
| 47933 | static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, | |||
| 47934 | SelectionDAG &DAG, | |||
| 47935 | const X86Subtarget &Subtarget) { | |||
| 47936 | if (CC == X86::COND_B) | |||
| 47937 | if (SDValue Flags = combineCarryThroughADD(EFLAGS, DAG)) | |||
| 47938 | return Flags; | |||
| 47939 | ||||
| 47940 | if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC)) | |||
| 47941 | return R; | |||
| 47942 | ||||
| 47943 | if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget)) | |||
| 47944 | return R; | |||
| 47945 | ||||
| 47946 | if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget)) | |||
| 47947 | return R; | |||
| 47948 | ||||
| 47949 | return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget); | |||
| 47950 | } | |||
| 47951 | ||||
| 47952 | /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL] | |||
| 47953 | static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, | |||
| 47954 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 47955 | const X86Subtarget &Subtarget) { | |||
| 47956 | SDLoc DL(N); | |||
| 47957 | ||||
| 47958 | SDValue FalseOp = N->getOperand(0); | |||
| 47959 | SDValue TrueOp = N->getOperand(1); | |||
| 47960 | X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); | |||
| 47961 | SDValue Cond = N->getOperand(3); | |||
| 47962 | ||||
| 47963 | // cmov X, X, ?, ? --> X | |||
| 47964 | if (TrueOp == FalseOp) | |||
| 47965 | return TrueOp; | |||
| 47966 | ||||
| 47967 | // Try to simplify the EFLAGS and condition code operands. | |||
| 47968 | // We can't always do this as FCMOV only supports a subset of X86 cond. | |||
| 47969 | if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) { | |||
| 47970 | if (!(FalseOp.getValueType() == MVT::f80 || | |||
| 47971 | (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) || | |||
| 47972 | (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) || | |||
| 47973 | !Subtarget.canUseCMOV() || hasFPCMov(CC)) { | |||
| 47974 | SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8), | |||
| 47975 | Flags}; | |||
| 47976 | return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); | |||
| 47977 | } | |||
| 47978 | } | |||
| 47979 | ||||
| 47980 | // If this is a select between two integer constants, try to do some | |||
| 47981 | // optimizations. Note that the operands are ordered the opposite of SELECT | |||
| 47982 | // operands. | |||
| 47983 | if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) { | |||
| 47984 | if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) { | |||
| 47985 | // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is | |||
| 47986 | // larger than FalseC (the false value). | |||
| 47987 | if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) { | |||
| 47988 | CC = X86::GetOppositeBranchCondition(CC); | |||
| 47989 | std::swap(TrueC, FalseC); | |||
| 47990 | std::swap(TrueOp, FalseOp); | |||
| 47991 | } | |||
| 47992 | ||||
| 47993 | // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0. | |||
| 47994 | // This is efficient for any integer data type (including i8/i16) and | |||
| 47995 | // shift amount. | |||
| 47996 | if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { | |||
| 47997 | Cond = getSETCC(CC, Cond, DL, DAG); | |||
| 47998 | ||||
| 47999 | // Zero extend the condition if needed. | |||
| 48000 | Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond); | |||
| 48001 | ||||
| 48002 | unsigned ShAmt = TrueC->getAPIntValue().logBase2(); | |||
| 48003 | Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, | |||
| 48004 | DAG.getConstant(ShAmt, DL, MVT::i8)); | |||
| 48005 | return Cond; | |||
| 48006 | } | |||
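// Example (annotation): i32 (C ? 8 : 0) becomes (zext (setcc C)) << 3, a
// setcc plus a shift, avoiding both a branch and materializing constants
// for a CMOV.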
| 48007 | ||||
| 48008 | // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient | |||
| 48009 | // for any integer data type, including i8/i16. | |||
| 48010 | if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) { | |||
| 48011 | Cond = getSETCC(CC, Cond, DL, DAG); | |||
| 48012 | ||||
| 48013 | // Zero extend the condition if needed. | |||
| 48014 | Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, | |||
| 48015 | FalseC->getValueType(0), Cond); | |||
| 48016 | Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, | |||
| 48017 | SDValue(FalseC, 0)); | |||
| 48018 | return Cond; | |||
| 48019 | } | |||
| 48020 | ||||
| 48021 | // Optimize cases that will turn into an LEA instruction. This requires | |||
| 48022 | // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). | |||
| 48023 | if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { | |||
| 48024 | APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue(); | |||
| 48025 | assert(Diff.getBitWidth() == N->getValueType(0).getSizeInBits() && | |||
| 48026 |        "Implicit constant truncation"); | |||
| 48027 | ||||
| 48028 | bool isFastMultiplier = false; | |||
| 48029 | if (Diff.ult(10)) { | |||
| 48030 | switch (Diff.getZExtValue()) { | |||
| 48031 | default: break; | |||
| 48032 | case 1: // result = add base, cond | |||
| 48033 | case 2: // result = lea base( , cond*2) | |||
| 48034 | case 3: // result = lea base(cond, cond*2) | |||
| 48035 | case 4: // result = lea base( , cond*4) | |||
| 48036 | case 5: // result = lea base(cond, cond*4) | |||
| 48037 | case 8: // result = lea base( , cond*8) | |||
| 48038 | case 9: // result = lea base(cond, cond*8) | |||
| 48039 | isFastMultiplier = true; | |||
| 48040 | break; | |||
| 48041 | } | |||
| 48042 | } | |||
| 48043 | ||||
| 48044 | if (isFastMultiplier) { | |||
| 48045 | Cond = getSETCC(CC, Cond, DL, DAG); | |||
| 48046 | // Zero extend the condition if needed. | |||
| 48047 | Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), | |||
| 48048 | Cond); | |||
| 48049 | // Scale the condition by the difference. | |||
| 48050 | if (Diff != 1) | |||
| 48051 | Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, | |||
| 48052 | DAG.getConstant(Diff, DL, Cond.getValueType())); | |||
| 48053 | ||||
| 48054 | // Add the base if non-zero. | |||
| 48055 | if (FalseC->getAPIntValue() != 0) | |||
| 48056 | Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, | |||
| 48057 | SDValue(FalseC, 0)); | |||
| 48058 | return Cond; | |||
| 48059 | } | |||
| 48060 | } | |||
| 48061 | } | |||
| 48062 | } | |||
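// Example (annotation): (C ? 13 : 8) has Diff == 5, a fast multiplier, so
// with %eax holding the zero-extended setcc this emits a single
// "leal 8(%eax,%eax,4), ..." to scale by 5 and add the base of 8.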
| 48063 | ||||
| 48064 | // Handle these cases: | |||
| 48065 | // (select (x != c), e, c) -> (select (x != c), e, x), | |||
| 48066 | // (select (x == c), c, e) -> (select (x == c), x, e) | |||
| 48067 | // where the c is an integer constant, and the "select" is the combination | |||
| 48068 | // of CMOV and CMP. | |||
| 48069 | // | |||
| 48070 | // The rationale for this change is that the conditional-move from a constant | |||
| 48071 | // needs two instructions, however, conditional-move from a register needs | |||
| 48072 | // only one instruction. | |||
| 48073 | // | |||
| 48074 | // CAVEAT: By replacing a constant with a symbolic value, it may obscure | |||
| 48075 | // some instruction-combining opportunities. This opt needs to be | |||
| 48076 | // postponed as late as possible. | |||
| 48077 | // | |||
| 48078 | if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) { | |||
| 48079 | // the DCI.xxxx conditions are provided to postpone the optimization as | |||
| 48080 | // late as possible. | |||
| 48081 | ||||
| 48082 | ConstantSDNode *CmpAgainst = nullptr; | |||
| 48083 | if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) && | |||
| 48084 | (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) && | |||
| 48085 | !isa<ConstantSDNode>(Cond.getOperand(0))) { | |||
| 48086 | ||||
| 48087 | if (CC == X86::COND_NE && | |||
| 48088 | CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) { | |||
| 48089 | CC = X86::GetOppositeBranchCondition(CC); | |||
| 48090 | std::swap(TrueOp, FalseOp); | |||
| 48091 | } | |||
| 48092 | ||||
| 48093 | if (CC == X86::COND_E && | |||
| 48094 | CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { | |||
| 48095 | SDValue Ops[] = {FalseOp, Cond.getOperand(0), | |||
| 48096 | DAG.getTargetConstant(CC, DL, MVT::i8), Cond}; | |||
| 48097 | return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); | |||
| 48098 | } | |||
| 48099 | } | |||
| 48100 | } | |||
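// Example (annotation): (select (x == 7), 7, e) -> (select (x == 7), x, e).
// On the path where the select picks the constant, x is known to equal 7,
// so the register substitutes for the constant and saves an immediate move.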
| 48101 | ||||
| 48102 | // Transform: | |||
| 48103 | // | |||
| 48104 | // (cmov 1 T (uge T 2)) | |||
| 48105 | // | |||
| 48106 | // to: | |||
| 48107 | // | |||
| 48108 | // (adc T 0 (sub T 1)) | |||
| 48109 | if (CC == X86::COND_AE && isOneConstant(FalseOp) && | |||
| 48110 | Cond.getOpcode() == X86ISD::SUB && Cond->hasOneUse()) { | |||
| 48111 | SDValue Cond0 = Cond.getOperand(0); | |||
| 48112 | if (Cond0.getOpcode() == ISD::TRUNCATE) | |||
| 48113 | Cond0 = Cond0.getOperand(0); | |||
| 48114 | auto *Sub1C = dyn_cast<ConstantSDNode>(Cond.getOperand(1)); | |||
| 48115 | if (Cond0 == TrueOp && Sub1C && Sub1C->getZExtValue() == 2) { | |||
| 48116 | EVT CondVT = Cond->getValueType(0); | |||
| 48117 | EVT OuterVT = N->getValueType(0); | |||
| 48118 | // Subtract 1 and generate a carry. | |||
| 48119 | SDValue NewSub = | |||
| 48120 | DAG.getNode(X86ISD::SUB, DL, Cond->getVTList(), Cond.getOperand(0), | |||
| 48121 | DAG.getConstant(1, DL, CondVT)); | |||
| 48122 | SDValue EFLAGS(NewSub.getNode(), 1); | |||
| 48123 | return DAG.getNode(X86ISD::ADC, DL, DAG.getVTList(OuterVT, MVT::i32), | |||
| 48124 | TrueOp, DAG.getConstant(0, DL, OuterVT), EFLAGS); | |||
| 48125 | } | |||
| 48126 | } | |||
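// Sanity check of the fold (annotation): (cmov 1, T, COND_AE, (sub T, 2))
// yields T when T >= 2 (unsigned) and 1 otherwise. After the rewrite,
// "sub T, 1" borrows (CF = 1) only when T == 0, so ADC(T, 0, CF) gives
// 0+0+1 = 1 for T == 0, 1+0+0 = 1 for T == 1, and T for T >= 2.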
| 48127 | ||||
| 48128 | // Fold and/or of setcc's to double CMOV: | |||
| 48129 | // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2) | |||
| 48130 | // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2) | |||
| 48131 | // | |||
| 48132 | // This combine lets us generate: | |||
| 48133 | // cmovcc1 (jcc1 if we don't have CMOV) | |||
| 48134 | // cmovcc2 (same) | |||
| 48135 | // instead of: | |||
| 48136 | // setcc1 | |||
| 48137 | // setcc2 | |||
| 48138 | // and/or | |||
| 48139 | // cmovne (jne if we don't have CMOV) | |||
| 48140 | // When we can't use the CMOV instruction, it might increase branch | |||
| 48141 | // mispredicts. | |||
| 48142 | // When we can use CMOV, or when there is no mispredict, this improves | |||
| 48143 | // throughput and reduces register pressure. | |||
| 48144 | // | |||
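// Example (annotation): (CMOV F, T, ((setcc cc1, f) | (setcc cc2, f)) != 0)
// becomes (CMOV (CMOV F, T, cc1), T, cc2), with both CMOVs reading the
// same EFLAGS value f.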
| 48145 | if (CC == X86::COND_NE) { | |||
| 48146 | SDValue Flags; | |||
| 48147 | X86::CondCode CC0, CC1; | |||
| 48148 | bool isAndSetCC; | |||
| 48149 | if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) { | |||
| 48150 | if (isAndSetCC) { | |||
| 48151 | std::swap(FalseOp, TrueOp); | |||
| 48152 | CC0 = X86::GetOppositeBranchCondition(CC0); | |||
| 48153 | CC1 = X86::GetOppositeBranchCondition(CC1); | |||
| 48154 | } | |||
| 48155 | ||||
| 48156 | SDValue LOps[] = {FalseOp, TrueOp, | |||
| 48157 | DAG.getTargetConstant(CC0, DL, MVT::i8), Flags}; | |||
| 48158 | SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps); | |||
| 48159 | SDValue Ops[] = {LCMOV, TrueOp, DAG.getTargetConstant(CC1, DL, MVT::i8), | |||
| 48160 | Flags}; | |||
| 48161 | SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); | |||
| 48162 | return CMOV; | |||
| 48163 | } | |||
| 48164 | } | |||
| 48165 | ||||
| 48166 | // Fold (CMOV C1, (ADD (CTTZ X), C2), (X != 0)) -> | |||
| 48167 | // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2) | |||
| 48168 | // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) -> | |||
| 48169 | // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2) | |||
| 48170 | if ((CC == X86::COND_NE || CC == X86::COND_E) && | |||
| 48171 | Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) { | |||
| 48172 | SDValue Add = TrueOp; | |||
| 48173 | SDValue Const = FalseOp; | |||
| 48174 | // Canonicalize the condition code for easier matching and output. | |||
| 48175 | if (CC == X86::COND_E) | |||
| 48176 | std::swap(Add, Const); | |||
| 48177 | ||||
| 48178 | // We might have replaced the constant in the cmov with the LHS of the | |||
| 48179 | // compare. If so, change it to the RHS of the compare. | |||
| 48180 | if (Const == Cond.getOperand(0)) | |||
| 48181 | Const = Cond.getOperand(1); | |||
| 48182 | ||||
| 48183 | // Ok, now make sure that Add is (add (cttz X), C2) and Const is a constant. | |||
| 48184 | if (isa<ConstantSDNode>(Const) && Add.getOpcode() == ISD::ADD && | |||
| 48185 | Add.hasOneUse() && isa<ConstantSDNode>(Add.getOperand(1)) && | |||
| 48186 | (Add.getOperand(0).getOpcode() == ISD::CTTZ_ZERO_UNDEF || | |||
| 48187 | Add.getOperand(0).getOpcode() == ISD::CTTZ) && | |||
| 48188 | Add.getOperand(0).getOperand(0) == Cond.getOperand(0)) { | |||
| 48189 | EVT VT = N->getValueType(0); | |||
| 48190 | // This should constant fold. | |||
| 48191 | SDValue Diff = DAG.getNode(ISD::SUB, DL, VT, Const, Add.getOperand(1)); | |||
| 48192 | SDValue CMov = | |||
| 48193 | DAG.getNode(X86ISD::CMOV, DL, VT, Diff, Add.getOperand(0), | |||
| 48194 | DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8), Cond); | |||
| 48195 | return DAG.getNode(ISD::ADD, DL, VT, CMov, Add.getOperand(1)); | |||
| 48196 | } | |||
| 48197 | } | |||
| 48198 | ||||
| 48199 | return SDValue(); | |||
| 48200 | } | |||
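| | ||||
| | // Editor's note: a minimal scalar sketch of the ADC rewrite above (not part | |||
| | // of the original source); 'cmov_uge2_ref' is a hypothetical name, assuming | |||
| | // <cstdint>. (cmov 1, T, (uge T 2)) selects T when T >= 2 and 1 otherwise; | |||
| | // SUB T, 1 sets the carry (borrow) flag only when T == 0, so ADC T, 0 | |||
| | // computes T + (T == 0), which is 1 for T in {0, 1} and T otherwise. | |||
| | static inline uint32_t cmov_uge2_ref(uint32_t T) { | |||
| | uint32_t Carry = (T == 0) ? 1u : 0u; // borrow out of (sub T, 1) | |||
| | return T + Carry; // matches (adc T, 0, (sub T, 1)) | |||
| | } | |||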
| 48201 | ||||
| 48202 | /// Different mul shrinking modes. | |||
| 48203 | enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 }; | |||
| 48204 | ||||
| 48205 | static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) { | |||
| 48206 | EVT VT = N->getOperand(0).getValueType(); | |||
| 48207 | if (VT.getScalarSizeInBits() != 32) | |||
| 48208 | return false; | |||
| 48209 | ||||
| 48210 | assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2"); | |||
| 48211 | unsigned SignBits[2] = {1, 1}; | |||
| 48212 | bool IsPositive[2] = {false, false}; | |||
| 48213 | for (unsigned i = 0; i < 2; i++) { | |||
| 48214 | SDValue Opd = N->getOperand(i); | |||
| 48215 | ||||
| 48216 | SignBits[i] = DAG.ComputeNumSignBits(Opd); | |||
| 48217 | IsPositive[i] = DAG.SignBitIsZero(Opd); | |||
| 48218 | } | |||
| 48219 | ||||
| 48220 | bool AllPositive = IsPositive[0] && IsPositive[1]; | |||
| 48221 | unsigned MinSignBits = std::min(SignBits[0], SignBits[1]); | |||
| 48222 | // When ranges are from -128 ~ 127, use MULS8 mode. | |||
| 48223 | if (MinSignBits >= 25) | |||
| 48224 | Mode = ShrinkMode::MULS8; | |||
| 48225 | // When ranges are from 0 ~ 255, use MULU8 mode. | |||
| 48226 | else if (AllPositive && MinSignBits >= 24) | |||
| 48227 | Mode = ShrinkMode::MULU8; | |||
| 48228 | // When ranges are from -32768 ~ 32767, use MULS16 mode. | |||
| 48229 | else if (MinSignBits >= 17) | |||
| 48230 | Mode = ShrinkMode::MULS16; | |||
| 48231 | // When ranges are from 0 ~ 65535, use MULU16 mode. | |||
| 48232 | else if (AllPositive && MinSignBits >= 16) | |||
| 48233 | Mode = ShrinkMode::MULU16; | |||
| 48234 | else | |||
| 48235 | return false; | |||
| 48236 | return true; | |||
| 48237 | } | |||
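| | ||||
| | // Editor's note: an illustrative check (not part of the original source) of | |||
| | // where the sign-bit thresholds above come from. For an i32 operand, having | |||
| | // at least 32 - 8 + 1 == 25 sign bits means sext32(trunc8(x)) == x, and at | |||
| | // least 32 - 16 + 1 == 17 sign bits means sext32(trunc16(x)) == x. | |||
| | static_assert(32 - 8 + 1 == 25 && 32 - 16 + 1 == 17, | |||
| | "sign-bit thresholds used by canReduceVMulWidth"); | |||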
| 48238 | ||||
| 48239 | /// When the operands of vector mul are extended from smaller size values, | |||
| 48240 | /// like i8 and i16, the type of mul may be shrunk to generate more | |||
| 48241 | /// efficient code. Two typical patterns are handled: | |||
| 48242 | /// Pattern1: | |||
| 48243 | /// %2 = sext/zext <N x i8> %1 to <N x i32> | |||
| 48244 | /// %4 = sext/zext <N x i8> %3 to <N x i32> | |||
| 48245 | /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants) | |||
| 48246 | /// %5 = mul <N x i32> %2, %4 | |||
| 48247 | /// | |||
| 48248 | /// Pattern2: | |||
| 48249 | /// %2 = zext/sext <N x i16> %1 to <N x i32> | |||
| 48250 | /// %4 = zext/sext <N x i16> %3 to <N x i32> | |||
| 48251 | /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants) | |||
| 48252 | /// %5 = mul <N x i32> %2, %4 | |||
| 48253 | /// | |||
| 48254 | /// There are four mul shrinking modes: | |||
| 48255 | /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is | |||
| 48256 | /// -128 to 127, and the scalar value range of %4 is also -128 to 127, | |||
| 48257 | /// generate pmullw+sext32 for it (MULS8 mode). | |||
| 48258 | /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is | |||
| 48259 | /// 0 to 255, and the scalar value range of %4 is also 0 to 255, | |||
| 48260 | /// generate pmullw+zext32 for it (MULU8 mode). | |||
| 48261 | /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is | |||
| 48262 | /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767, | |||
| 48263 | /// generate pmullw+pmulhw for it (MULS16 mode). | |||
| 48264 | /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is | |||
| 48265 | /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535, | |||
| 48266 | /// generate pmullw+pmulhuw for it (MULU16 mode). | |||
| 48267 | static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, | |||
| 48268 | const X86Subtarget &Subtarget) { | |||
| 48269 | // Check for legality | |||
| 48270 | // pmullw/pmulhw are not available before SSE2. | |||
| 48271 | if (!Subtarget.hasSSE2()) | |||
| 48272 | return SDValue(); | |||
| 48273 | ||||
| 48274 | // Check for profitability | |||
| 48275 | // pmulld is supported since SSE41. It is better to use pmulld | |||
| 48276 | // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than | |||
| 48277 | // the expansion. | |||
| 48278 | bool OptForMinSize = DAG.getMachineFunction().getFunction().hasMinSize(); | |||
| 48279 | if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow())) | |||
| 48280 | return SDValue(); | |||
| 48281 | ||||
| 48282 | ShrinkMode Mode; | |||
| 48283 | if (!canReduceVMulWidth(N, DAG, Mode)) | |||
| 48284 | return SDValue(); | |||
| 48285 | ||||
| 48286 | SDLoc DL(N); | |||
| 48287 | SDValue N0 = N->getOperand(0); | |||
| 48288 | SDValue N1 = N->getOperand(1); | |||
| 48289 | EVT VT = N->getOperand(0).getValueType(); | |||
| 48290 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 48291 | if ((NumElts % 2) != 0) | |||
| 48292 | return SDValue(); | |||
| 48293 | ||||
| 48294 | EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts); | |||
| 48295 | ||||
| 48296 | // Shrink the operands of mul. | |||
| 48297 | SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0); | |||
| 48298 | SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1); | |||
| 48299 | ||||
| 48300 | // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the | |||
| 48301 | // lower part is needed. | |||
| 48302 | SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); | |||
| 48303 | if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8) | |||
| 48304 | return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND | |||
| 48305 | : ISD::SIGN_EXTEND, | |||
| 48306 | DL, VT, MulLo); | |||
| 48307 | ||||
| 48308 | EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2); | |||
| 48309 | // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, | |||
| 48310 | // the higher part is also needed. | |||
| 48311 | SDValue MulHi = | |||
| 48312 | DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL, | |||
| 48313 | ReducedVT, NewN0, NewN1); | |||
| 48314 | ||||
| 48315 | // Repack the lower part and higher part result of mul into a wider | |||
| 48316 | // result. | |||
| 48317 | // Generate shuffle functioning as punpcklwd. | |||
| 48318 | SmallVector<int, 16> ShuffleMask(NumElts); | |||
| 48319 | for (unsigned i = 0, e = NumElts / 2; i < e; i++) { | |||
| 48320 | ShuffleMask[2 * i] = i; | |||
| 48321 | ShuffleMask[2 * i + 1] = i + NumElts; | |||
| 48322 | } | |||
| 48323 | SDValue ResLo = | |||
| 48324 | DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); | |||
| 48325 | ResLo = DAG.getBitcast(ResVT, ResLo); | |||
| 48326 | // Generate shuffle functioning as punpckhwd. | |||
| 48327 | for (unsigned i = 0, e = NumElts / 2; i < e; i++) { | |||
| 48328 | ShuffleMask[2 * i] = i + NumElts / 2; | |||
| 48329 | ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2; | |||
| 48330 | } | |||
| 48331 | SDValue ResHi = | |||
| 48332 | DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); | |||
| 48333 | ResHi = DAG.getBitcast(ResVT, ResHi); | |||
| 48334 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); | |||
| 48335 | } | |||
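| | ||||
| | // Editor's note: a minimal scalar sketch of the repacking above (not part of | |||
| | // the original source); 'repack_ref' is a hypothetical name, assuming | |||
| | // <cstdint>. After the punpcklwd/punpckhwd-style shuffles, each i32 lane | |||
| | // holds the (lo, hi) i16 halves of the same product. | |||
| | static inline uint32_t repack_ref(uint16_t LoElt, uint16_t HiElt) { | |||
| | // Little-endian lane: pmullw result in bits 0-15, pmulh(u)w in bits 16-31. | |||
| | return (uint32_t)LoElt | ((uint32_t)HiElt << 16); | |||
| | } | |||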
| 48336 | ||||
| 48337 | static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG, | |||
| 48338 | EVT VT, const SDLoc &DL) { | |||
| 48339 | ||||
| 48340 | auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) { | |||
| 48341 | SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), | |||
| 48342 | DAG.getConstant(Mult, DL, VT)); | |||
| 48343 | Result = DAG.getNode(ISD::SHL, DL, VT, Result, | |||
| 48344 | DAG.getConstant(Shift, DL, MVT::i8)); | |||
| 48345 | Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result, | |||
| 48346 | N->getOperand(0)); | |||
| 48347 | return Result; | |||
| 48348 | }; | |||
| 48349 | ||||
| 48350 | auto combineMulMulAddOrSub = [&](int Mul1, int Mul2, bool isAdd) { | |||
| 48351 | SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), | |||
| 48352 | DAG.getConstant(Mul1, DL, VT)); | |||
| 48353 | Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, Result, | |||
| 48354 | DAG.getConstant(Mul2, DL, VT)); | |||
| 48355 | Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result, | |||
| 48356 | N->getOperand(0)); | |||
| 48357 | return Result; | |||
| 48358 | }; | |||
| 48359 | ||||
| 48360 | switch (MulAmt) { | |||
| 48361 | default: | |||
| 48362 | break; | |||
| 48363 | case 11: | |||
| 48364 | // mul x, 11 => add ((shl (mul x, 5), 1), x) | |||
| 48365 | return combineMulShlAddOrSub(5, 1, /*isAdd*/ true); | |||
| 48366 | case 21: | |||
| 48367 | // mul x, 21 => add ((shl (mul x, 5), 2), x) | |||
| 48368 | return combineMulShlAddOrSub(5, 2, /*isAdd*/ true); | |||
| 48369 | case 41: | |||
| 48370 | // mul x, 41 => add ((shl (mul x, 5), 3), x) | |||
| 48371 | return combineMulShlAddOrSub(5, 3, /*isAdd*/ true); | |||
| 48372 | case 22: | |||
| 48373 | // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x) | |||
| 48374 | return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), | |||
| 48375 | combineMulShlAddOrSub(5, 2, /*isAdd*/ true)); | |||
| 48376 | case 19: | |||
| 48377 | // mul x, 19 => add ((shl (mul x, 9), 1), x) | |||
| 48378 | return combineMulShlAddOrSub(9, 1, /*isAdd*/ true); | |||
| 48379 | case 37: | |||
| 48380 | // mul x, 37 => add ((shl (mul x, 9), 2), x) | |||
| 48381 | return combineMulShlAddOrSub(9, 2, /*isAdd*/ true); | |||
| 48382 | case 73: | |||
| 48383 | // mul x, 73 => add ((shl (mul x, 9), 3), x) | |||
| 48384 | return combineMulShlAddOrSub(9, 3, /*isAdd*/ true); | |||
| 48385 | case 13: | |||
| 48386 | // mul x, 13 => add ((shl (mul x, 3), 2), x) | |||
| 48387 | return combineMulShlAddOrSub(3, 2, /*isAdd*/ true); | |||
| 48388 | case 23: | |||
| 48389 | // mul x, 23 => sub ((shl (mul x, 3), 3), x) | |||
| 48390 | return combineMulShlAddOrSub(3, 3, /*isAdd*/ false); | |||
| 48391 | case 26: | |||
| 48392 | // mul x, 26 => add ((mul (mul x, 5), 5), x) | |||
| 48393 | return combineMulMulAddOrSub(5, 5, /*isAdd*/ true); | |||
| 48394 | case 28: | |||
| 48395 | // mul x, 28 => add ((mul (mul x, 9), 3), x) | |||
| 48396 | return combineMulMulAddOrSub(9, 3, /*isAdd*/ true); | |||
| 48397 | case 29: | |||
| 48398 | // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x) | |||
| 48399 | return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), | |||
| 48400 | combineMulMulAddOrSub(9, 3, /*isAdd*/ true)); | |||
| 48401 | } | |||
| 48402 | ||||
| 48403 | // Another trick. If this is a power of 2 + 2/4/8, we can use a shift followed | |||
| 48404 | // by a single LEA. | |||
| 48405 | // First check if this is a sum of two powers of 2 because that's easy. Then | |||
| 48406 | // count how many zeros are up to the first bit. | |||
| 48407 | // TODO: We can do this even without LEA at a cost of two shifts and an add. | |||
| 48408 | if (isPowerOf2_64(MulAmt & (MulAmt - 1))) { | |||
| 48409 | unsigned ScaleShift = llvm::countr_zero(MulAmt); | |||
| 48410 | if (ScaleShift >= 1 && ScaleShift < 4) { | |||
| 48411 | unsigned ShiftAmt = Log2_64((MulAmt & (MulAmt - 1))); | |||
| 48412 | SDValue Shift1 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), | |||
| 48413 | DAG.getConstant(ShiftAmt, DL, MVT::i8)); | |||
| 48414 | SDValue Shift2 = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), | |||
| 48415 | DAG.getConstant(ScaleShift, DL, MVT::i8)); | |||
| 48416 | return DAG.getNode(ISD::ADD, DL, VT, Shift1, Shift2); | |||
| 48417 | } | |||
| 48418 | } | |||
| 48419 | ||||
| 48420 | return SDValue(); | |||
| 48421 | } | |||
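| | ||||
| | // Editor's note: the case table above rests on plain arithmetic identities; | |||
| | // a hypothetical check for the '11' and '23' cases (not part of the original | |||
| | // source), assuming <cstdint>. | |||
| | static inline bool mul_special_identities_ref(uint64_t X) { | |||
| | bool M11 = (((X * 5) << 1) + X) == X * 11; // lea; shl; add | |||
| | bool M23 = (((X * 3) << 3) - X) == X * 23; // lea; shl; sub | |||
| | return M11 && M23; | |||
| | } | |||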
| 48422 | ||||
| 48423 | // If the upper 17 bits of each element in one operand are zero and the other | |||
| 48424 | // operand's elements are all zero/sign bits, then we can use PMADDWD, which is | |||
| 48425 | // always at least as quick as PMULLD, except on KNL. | |||
| 48426 | static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG, | |||
| 48427 | const X86Subtarget &Subtarget) { | |||
| 48428 | if (!Subtarget.hasSSE2()) | |||
| 48429 | return SDValue(); | |||
| 48430 | ||||
| 48431 | if (Subtarget.isPMADDWDSlow()) | |||
| 48432 | return SDValue(); | |||
| 48433 | ||||
| 48434 | EVT VT = N->getValueType(0); | |||
| 48435 | ||||
| 48436 | // Only support vXi32 vectors. | |||
| 48437 | if (!VT.isVector() || VT.getVectorElementType() != MVT::i32) | |||
| 48438 | return SDValue(); | |||
| 48439 | ||||
| 48440 | // Make sure the type is legal or can split/widen to a legal type. | |||
| 48441 | // With AVX512 but without BWI, we would need to split v32i16. | |||
| 48442 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 48443 | if (NumElts == 1 || !isPowerOf2_32(NumElts)) | |||
| 48444 | return SDValue(); | |||
| 48445 | ||||
| 48446 | // With AVX512 but without BWI, we would need to split v32i16. | |||
| 48447 | if (32 <= (2 * NumElts) && Subtarget.hasAVX512() && !Subtarget.hasBWI()) | |||
| 48448 | return SDValue(); | |||
| 48449 | ||||
| 48450 | SDValue N0 = N->getOperand(0); | |||
| 48451 | SDValue N1 = N->getOperand(1); | |||
| 48452 | ||||
| 48453 | // If we are zero/sign extending two steps without SSE4.1, it's better to | |||
| 48454 | // reduce the vmul width instead. | |||
| 48455 | if (!Subtarget.hasSSE41() && | |||
| 48456 | (((N0.getOpcode() == ISD::ZERO_EXTEND && | |||
| 48457 | N0.getOperand(0).getScalarValueSizeInBits() <= 8) && | |||
| 48458 | (N1.getOpcode() == ISD::ZERO_EXTEND && | |||
| 48459 | N1.getOperand(0).getScalarValueSizeInBits() <= 8)) || | |||
| 48460 | ((N0.getOpcode() == ISD::SIGN_EXTEND && | |||
| 48461 | N0.getOperand(0).getScalarValueSizeInBits() <= 8) && | |||
| 48462 | (N1.getOpcode() == ISD::SIGN_EXTEND && | |||
| 48463 | N1.getOperand(0).getScalarValueSizeInBits() <= 8)))) | |||
| 48464 | return SDValue(); | |||
| 48465 | ||||
| 48466 | // If we are sign extending a wide vector without SSE4.1, it's better to reduce | |||
| 48467 | // the vmul width instead. | |||
| 48468 | if (!Subtarget.hasSSE41() && | |||
| 48469 | (N0.getOpcode() == ISD::SIGN_EXTEND && | |||
| 48470 | N0.getOperand(0).getValueSizeInBits() > 128) && | |||
| 48471 | (N1.getOpcode() == ISD::SIGN_EXTEND && | |||
| 48472 | N1.getOperand(0).getValueSizeInBits() > 128)) | |||
| 48473 | return SDValue(); | |||
| 48474 | ||||
| 48475 | // Sign bits must extend down to the lowest i16. | |||
| 48476 | if (DAG.ComputeMaxSignificantBits(N1) > 16 || | |||
| 48477 | DAG.ComputeMaxSignificantBits(N0) > 16) | |||
| 48478 | return SDValue(); | |||
| 48479 | ||||
| 48480 | // At least one of the elements must be zero in the upper 17 bits, or can be | |||
| 48481 | // safely made zero without altering the final result. | |||
| 48482 | auto GetZeroableOp = [&](SDValue Op) { | |||
| 48483 | APInt Mask17 = APInt::getHighBitsSet(32, 17); | |||
| 48484 | if (DAG.MaskedValueIsZero(Op, Mask17)) | |||
| 48485 | return Op; | |||
| 48486 | // Mask off upper 16-bits of sign-extended constants. | |||
| 48487 | if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) | |||
| 48488 | return DAG.getNode(ISD::AND, SDLoc(N), VT, Op, | |||
| 48489 | DAG.getConstant(0xFFFF, SDLoc(N), VT)); | |||
| 48490 | if (Op.getOpcode() == ISD::SIGN_EXTEND && N->isOnlyUserOf(Op.getNode())) { | |||
| 48491 | SDValue Src = Op.getOperand(0); | |||
| 48492 | // Convert sext(vXi16) to zext(vXi16). | |||
| 48493 | if (Src.getScalarValueSizeInBits() == 16 && VT.getSizeInBits() <= 128) | |||
| 48494 | return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src); | |||
| 48495 | // Convert sext(vXi8) to zext(vXi16 sext(vXi8)) on pre-SSE41 targets | |||
| 48496 | // which will expand the extension. | |||
| 48497 | if (Src.getScalarValueSizeInBits() < 16 && !Subtarget.hasSSE41()) { | |||
| 48498 | EVT ExtVT = VT.changeVectorElementType(MVT::i16); | |||
| 48499 | Src = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), ExtVT, Src); | |||
| 48500 | return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Src); | |||
| 48501 | } | |||
| 48502 | } | |||
| 48503 | // Convert SIGN_EXTEND_VECTOR_INREG to ZERO_EXTEND_VECTOR_INREG. | |||
| 48504 | if (Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG && | |||
| 48505 | N->isOnlyUserOf(Op.getNode())) { | |||
| 48506 | SDValue Src = Op.getOperand(0); | |||
| 48507 | if (Src.getScalarValueSizeInBits() == 16) | |||
| 48508 | return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src); | |||
| 48509 | } | |||
| 48510 | // Convert VSRAI(Op, 16) to VSRLI(Op, 16). | |||
| 48511 | if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 && | |||
| 48512 | N->isOnlyUserOf(Op.getNode())) { | |||
| 48513 | return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0), | |||
| 48514 | Op.getOperand(1)); | |||
| 48515 | } | |||
| 48516 | return SDValue(); | |||
| 48517 | }; | |||
| 48518 | SDValue ZeroN0 = GetZeroableOp(N0); | |||
| 48519 | SDValue ZeroN1 = GetZeroableOp(N1); | |||
| 48520 | if (!ZeroN0 && !ZeroN1) | |||
| 48521 | return SDValue(); | |||
| 48522 | N0 = ZeroN0 ? ZeroN0 : N0; | |||
| 48523 | N1 = ZeroN1 ? ZeroN1 : N1; | |||
| 48524 | ||||
| 48525 | // Use SplitOpsAndApply to handle AVX splitting. | |||
| 48526 | auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, | |||
| 48527 | ArrayRef<SDValue> Ops) { | |||
| 48528 | MVT ResVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32); | |||
| 48529 | MVT OpVT = MVT::getVectorVT(MVT::i16, Ops[0].getValueSizeInBits() / 16); | |||
| 48530 | return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, | |||
| 48531 | DAG.getBitcast(OpVT, Ops[0]), | |||
| 48532 | DAG.getBitcast(OpVT, Ops[1])); | |||
| 48533 | }; | |||
| 48534 | return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, {N0, N1}, | |||
| 48535 | PMADDWDBuilder); | |||
| 48536 | } | |||
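| | ||||
| | // Editor's note: a minimal scalar sketch of one i32 lane of VPMADDWD (not | |||
| | // part of the original source); 'pmaddwd_lane_ref' is a hypothetical name, | |||
| | // assuming <cstdint>. Each lane sums two adjacent signed i16 products; when | |||
| | // one operand's upper 17 bits are zero, its high i16 half is zero, the odd | |||
| | // product vanishes, and the lane reduces to a single 16x16->32 multiply. | |||
| | static inline int32_t pmaddwd_lane_ref(int16_t A0, int16_t A1, | |||
| | int16_t B0, int16_t B1) { | |||
| | return (int32_t)A0 * B0 + (int32_t)A1 * B1; | |||
| | } | |||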
| 48537 | ||||
| 48538 | static SDValue combineMulToPMULDQ(SDNode *N, SelectionDAG &DAG, | |||
| 48539 | const X86Subtarget &Subtarget) { | |||
| 48540 | if (!Subtarget.hasSSE2()) | |||
| 48541 | return SDValue(); | |||
| 48542 | ||||
| 48543 | EVT VT = N->getValueType(0); | |||
| 48544 | ||||
| 48545 | // Only support vXi64 vectors. | |||
| 48546 | if (!VT.isVector() || VT.getVectorElementType() != MVT::i64 || | |||
| 48547 | VT.getVectorNumElements() < 2 || | |||
| 48548 | !isPowerOf2_32(VT.getVectorNumElements())) | |||
| 48549 | return SDValue(); | |||
| 48550 | ||||
| 48551 | SDValue N0 = N->getOperand(0); | |||
| 48552 | SDValue N1 = N->getOperand(1); | |||
| 48553 | ||||
| 48554 | // PMULDQ returns the 64-bit result of the signed multiplication of the lower | |||
| 48555 | // 32-bits. We can lower with this if the sign bits stretch that far. | |||
| 48556 | if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(N0) > 32 && | |||
| 48557 | DAG.ComputeNumSignBits(N1) > 32) { | |||
| 48558 | auto PMULDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL, | |||
| 48559 | ArrayRef<SDValue> Ops) { | |||
| 48560 | return DAG.getNode(X86ISD::PMULDQ, DL, Ops[0].getValueType(), Ops); | |||
| 48561 | }; | |||
| 48562 | return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 }, | |||
| 48563 | PMULDQBuilder, /*CheckBWI*/false); | |||
| 48564 | } | |||
| 48565 | ||||
| 48566 | // If the upper bits are zero we can use a single pmuludq. | |||
| 48567 | APInt Mask = APInt::getHighBitsSet(64, 32); | |||
| 48568 | if (DAG.MaskedValueIsZero(N0, Mask) && DAG.MaskedValueIsZero(N1, Mask)) { | |||
| 48569 | auto PMULUDQBuilder = [](SelectionDAG &DAG, const SDLoc &DL, | |||
| 48570 | ArrayRef<SDValue> Ops) { | |||
| 48571 | return DAG.getNode(X86ISD::PMULUDQ, DL, Ops[0].getValueType(), Ops); | |||
| 48572 | }; | |||
| 48573 | return SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, { N0, N1 }, | |||
| 48574 | PMULUDQBuilder, /*CheckBWI*/false); | |||
| 48575 | } | |||
| 48576 | ||||
| 48577 | return SDValue(); | |||
| 48578 | } | |||
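| | ||||
| | // Editor's note: minimal scalar sketches of the two multiplies used above | |||
| | // (not part of the original source); the names are hypothetical, assuming | |||
| | // <cstdint>. | |||
| | static inline uint64_t pmuludq_lane_ref(uint64_t A, uint64_t B) { | |||
| | // PMULUDQ: full 64-bit product of the unsigned low 32 bits of each lane. | |||
| | return (A & 0xFFFFFFFFu) * (B & 0xFFFFFFFFu); | |||
| | } | |||
| | static inline int64_t pmuldq_lane_ref(int64_t A, int64_t B) { | |||
| | // PMULDQ (SSE4.1): full 64-bit product of the signed low 32 bits; usable | |||
| | // above because both operands have more than 32 sign bits. | |||
| | return (int64_t)(int32_t)A * (int64_t)(int32_t)B; | |||
| | } | |||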
| 48579 | ||||
| 48580 | static SDValue combineMul(SDNode *N, SelectionDAG &DAG, | |||
| 48581 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 48582 | const X86Subtarget &Subtarget) { | |||
| 48583 | EVT VT = N->getValueType(0); | |||
| 48584 | ||||
| 48585 | if (SDValue V = combineMulToPMADDWD(N, DAG, Subtarget)) | |||
| 48586 | return V; | |||
| 48587 | ||||
| 48588 | if (SDValue V = combineMulToPMULDQ(N, DAG, Subtarget)) | |||
| 48589 | return V; | |||
| 48590 | ||||
| 48591 | if (DCI.isBeforeLegalize() && VT.isVector()) | |||
| 48592 | return reduceVMULWidth(N, DAG, Subtarget); | |||
| 48593 | ||||
| 48594 | // Optimize a single multiply with constant into two operations in order to | |||
| 48595 | // implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA. | |||
| 48596 | if (!MulConstantOptimization) | |||
| 48597 | return SDValue(); | |||
| 48598 | ||||
| 48599 | // An imul is usually smaller than the alternative sequence. | |||
| 48600 | if (DAG.getMachineFunction().getFunction().hasMinSize()) | |||
| 48601 | return SDValue(); | |||
| 48602 | ||||
| 48603 | if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) | |||
| 48604 | return SDValue(); | |||
| 48605 | ||||
| 48606 | if (VT != MVT::i64 && VT != MVT::i32) | |||
| 48607 | return SDValue(); | |||
| 48608 | ||||
| 48609 | ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1)); | |||
| 48610 | if (!C) | |||
| 48611 | return SDValue(); | |||
| 48612 | if (isPowerOf2_64(C->getZExtValue())) | |||
| 48613 | return SDValue(); | |||
| 48614 | ||||
| 48615 | int64_t SignMulAmt = C->getSExtValue(); | |||
| 48616 | assert(SignMulAmt != INT64_MIN && "Int min should have been handled!"); | |||
| 48617 | uint64_t AbsMulAmt = SignMulAmt < 0 ? -SignMulAmt : SignMulAmt; | |||
| 48618 | ||||
| 48619 | SDLoc DL(N); | |||
| 48620 | if (AbsMulAmt == 3 || AbsMulAmt == 5 || AbsMulAmt == 9) { | |||
| 48621 | SDValue NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), | |||
| 48622 | DAG.getConstant(AbsMulAmt, DL, VT)); | |||
| 48623 | if (SignMulAmt < 0) | |||
| 48624 | NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), | |||
| 48625 | NewMul); | |||
| 48626 | ||||
| 48627 | return NewMul; | |||
| 48628 | } | |||
| 48629 | ||||
| 48630 | uint64_t MulAmt1 = 0; | |||
| 48631 | uint64_t MulAmt2 = 0; | |||
| 48632 | if ((AbsMulAmt % 9) == 0) { | |||
| 48633 | MulAmt1 = 9; | |||
| 48634 | MulAmt2 = AbsMulAmt / 9; | |||
| 48635 | } else if ((AbsMulAmt % 5) == 0) { | |||
| 48636 | MulAmt1 = 5; | |||
| 48637 | MulAmt2 = AbsMulAmt / 5; | |||
| 48638 | } else if ((AbsMulAmt % 3) == 0) { | |||
| 48639 | MulAmt1 = 3; | |||
| 48640 | MulAmt2 = AbsMulAmt / 3; | |||
| 48641 | } | |||
| 48642 | ||||
| 48643 | SDValue NewMul; | |||
| 48644 | // For negative multiply amounts, only allow MulAmt2 to be a power of 2. | |||
| 48645 | if (MulAmt2 && | |||
| 48646 | (isPowerOf2_64(MulAmt2) || | |||
| 48647 | (SignMulAmt >= 0 && (MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)))) { | |||
| 48648 | ||||
| 48649 | if (isPowerOf2_64(MulAmt2) && | |||
| 48650 | !(SignMulAmt >= 0 && N->hasOneUse() && | |||
| 48651 | N->use_begin()->getOpcode() == ISD::ADD)) | |||
| 48652 | // If the second multiplier is a power of 2, issue it first. We want the multiply | |||
| 48653 | // 3, 5, or 9 to be folded into the addressing mode unless the lone use | |||
| 48654 | // is an add. Only do this for positive multiply amounts since the | |||
| 48655 | // negate would prevent it from being used as an address mode anyway. | |||
| 48656 | std::swap(MulAmt1, MulAmt2); | |||
| 48657 | ||||
| 48658 | if (isPowerOf2_64(MulAmt1)) | |||
| 48659 | NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), | |||
| 48660 | DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8)); | |||
| 48661 | else | |||
| 48662 | NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0), | |||
| 48663 | DAG.getConstant(MulAmt1, DL, VT)); | |||
| 48664 | ||||
| 48665 | if (isPowerOf2_64(MulAmt2)) | |||
| 48666 | NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul, | |||
| 48667 | DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8)); | |||
| 48668 | else | |||
| 48669 | NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul, | |||
| 48670 | DAG.getConstant(MulAmt2, DL, VT)); | |||
| 48671 | ||||
| 48672 | // Negate the result. | |||
| 48673 | if (SignMulAmt < 0) | |||
| 48674 | NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), | |||
| 48675 | NewMul); | |||
| 48676 | } else if (!Subtarget.slowLEA()) | |||
| 48677 | NewMul = combineMulSpecial(C->getZExtValue(), N, DAG, VT, DL); | |||
| 48678 | ||||
| 48679 | if (!NewMul) { | |||
| 48680 | assert(C->getZExtValue() != 0 && | |||
| 48681 | C->getZExtValue() != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) && | |||
| 48682 | "Both cases that could cause potential overflows should have " | |||
| 48683 | "already been handled."); | |||
| 48684 | if (isPowerOf2_64(AbsMulAmt - 1)) { | |||
| 48685 | // (mul x, 2^N + 1) => (add (shl x, N), x) | |||
| 48686 | NewMul = DAG.getNode( | |||
| 48687 | ISD::ADD, DL, VT, N->getOperand(0), | |||
| 48688 | DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), | |||
| 48689 | DAG.getConstant(Log2_64(AbsMulAmt - 1), DL, | |||
| 48690 | MVT::i8))); | |||
| 48691 | // To negate, subtract the number from zero | |||
| 48692 | if (SignMulAmt < 0) | |||
| 48693 | NewMul = DAG.getNode(ISD::SUB, DL, VT, | |||
| 48694 | DAG.getConstant(0, DL, VT), NewMul); | |||
| 48695 | } else if (isPowerOf2_64(AbsMulAmt + 1)) { | |||
| 48696 | // (mul x, 2^N - 1) => (sub (shl x, N), x) | |||
| 48697 | NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), | |||
| 48698 | DAG.getConstant(Log2_64(AbsMulAmt + 1), | |||
| 48699 | DL, MVT::i8)); | |||
| 48700 | // To negate, reverse the operands of the subtract. | |||
| 48701 | if (SignMulAmt < 0) | |||
| 48702 | NewMul = DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), NewMul); | |||
| 48703 | else | |||
| 48704 | NewMul = DAG.getNode(ISD::SUB, DL, VT, NewMul, N->getOperand(0)); | |||
| 48705 | } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt - 2)) { | |||
| 48706 | // (mul x, 2^N + 2) => (add (shl x, N), (add x, x)) | |||
| 48707 | NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), | |||
| 48708 | DAG.getConstant(Log2_64(AbsMulAmt - 2), | |||
| 48709 | DL, MVT::i8)); | |||
| 48710 | NewMul = DAG.getNode( | |||
| 48711 | ISD::ADD, DL, VT, NewMul, | |||
| 48712 | DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0))); | |||
| 48713 | } else if (SignMulAmt >= 0 && isPowerOf2_64(AbsMulAmt + 2)) { | |||
| 48714 | // (mul x, 2^N - 2) => (sub (shl x, N), (add x, x)) | |||
| 48715 | NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0), | |||
| 48716 | DAG.getConstant(Log2_64(AbsMulAmt + 2), | |||
| 48717 | DL, MVT::i8)); | |||
| 48718 | NewMul = DAG.getNode( | |||
| 48719 | ISD::SUB, DL, VT, NewMul, | |||
| 48720 | DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0), N->getOperand(0))); | |||
| 48721 | } | |||
| 48722 | } | |||
| 48723 | ||||
| 48724 | return NewMul; | |||
| 48725 | } | |||
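| | ||||
| | // Editor's note: the fallback decompositions above are identities; a | |||
| | // hypothetical check (not part of the original source) for AbsMulAmt == 17 | |||
| | // (2^4 + 1) and AbsMulAmt == 30 (2^5 - 2), assuming <cstdint>. | |||
| | static inline bool mul_const_identities_ref(uint64_t X) { | |||
| | bool Pow2Plus1 = ((X << 4) + X) == X * 17; // (add (shl x, 4), x) | |||
| | bool Pow2Minus2 = ((X << 5) - (X + X)) == X * 30; // (sub (shl x, 5), (add x, x)) | |||
| | return Pow2Plus1 && Pow2Minus2; | |||
| | } | |||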
| 48726 | ||||
| 48727 | // Try to form a MULHU or MULHS node by looking for | |||
| 48728 | // (srl (mul ext, ext), 16) | |||
| 48729 | // TODO: This is X86 specific because we want to be able to handle wide types | |||
| 48730 | // before type legalization. But we can only do it if the vector will be | |||
| 48731 | // legalized via widening/splitting. Type legalization can't handle promotion | |||
| 48732 | // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG | |||
| 48733 | // combiner. | |||
| 48734 | static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG, | |||
| 48735 | const X86Subtarget &Subtarget) { | |||
| 48736 | assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) && | |||
| 48737 | "SRL or SRA node is required here!"); | |||
| 48738 | SDLoc DL(N); | |||
| 48739 | ||||
| 48740 | if (!Subtarget.hasSSE2()) | |||
| 48741 | return SDValue(); | |||
| 48742 | ||||
| 48743 | // The operation feeding into the shift must be a multiply. | |||
| 48744 | SDValue ShiftOperand = N->getOperand(0); | |||
| 48745 | if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse()) | |||
| 48746 | return SDValue(); | |||
| 48747 | ||||
| 48748 | // Input type should be at least vXi32. | |||
| 48749 | EVT VT = N->getValueType(0); | |||
| 48750 | if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32) | |||
| 48751 | return SDValue(); | |||
| 48752 | ||||
| 48753 | // Need a shift by 16. | |||
| 48754 | APInt ShiftAmt; | |||
| 48755 | if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) || | |||
| 48756 | ShiftAmt != 16) | |||
| 48757 | return SDValue(); | |||
| 48758 | ||||
| 48759 | SDValue LHS = ShiftOperand.getOperand(0); | |||
| 48760 | SDValue RHS = ShiftOperand.getOperand(1); | |||
| 48761 | ||||
| 48762 | unsigned ExtOpc = LHS.getOpcode(); | |||
| 48763 | if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) || | |||
| 48764 | RHS.getOpcode() != ExtOpc) | |||
| 48765 | return SDValue(); | |||
| 48766 | ||||
| 48767 | // Peek through the extends. | |||
| 48768 | LHS = LHS.getOperand(0); | |||
| 48769 | RHS = RHS.getOperand(0); | |||
| 48770 | ||||
| 48771 | // Ensure the input types match. | |||
| 48772 | EVT MulVT = LHS.getValueType(); | |||
| 48773 | if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT) | |||
| 48774 | return SDValue(); | |||
| 48775 | ||||
| 48776 | unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU; | |||
| 48777 | SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS); | |||
| 48778 | ||||
| 48779 | ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; | |||
| 48780 | return DAG.getNode(ExtOpc, DL, VT, Mulh); | |||
| 48781 | } | |||
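| | ||||
| | // Editor's note: the scalar equivalence being matched, for unsigned i16 (not | |||
| | // part of the original source); 'mulhu16_ref' is a hypothetical name, | |||
| | // assuming <cstdint>. | |||
| | static inline uint16_t mulhu16_ref(uint16_t A, uint16_t B) { | |||
| | // (srl (mul (zext A), (zext B)), 16) is exactly the high half of the | |||
| | // 16x16->32 product, i.e. MULHU. | |||
| | return (uint16_t)(((uint32_t)A * (uint32_t)B) >> 16); | |||
| | } | |||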
| 48782 | ||||
| 48783 | static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { | |||
| 48784 | SDValue N0 = N->getOperand(0); | |||
| 48785 | SDValue N1 = N->getOperand(1); | |||
| 48786 | ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1); | |||
| 48787 | EVT VT = N0.getValueType(); | |||
| 48788 | ||||
| 48789 | // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2)) | |||
| 48790 | // since the result of setcc_c is all zero's or all ones. | |||
| 48791 | if (VT.isInteger() && !VT.isVector() && | |||
| 48792 | N1C && N0.getOpcode() == ISD::AND && | |||
| 48793 | N0.getOperand(1).getOpcode() == ISD::Constant) { | |||
| 48794 | SDValue N00 = N0.getOperand(0); | |||
| 48795 | APInt Mask = N0.getConstantOperandAPInt(1); | |||
| 48796 | Mask <<= N1C->getAPIntValue(); | |||
| 48797 | bool MaskOK = false; | |||
| 48798 | // We can handle cases concerning bit-widening nodes containing setcc_c if | |||
| 48799 | // we carefully interrogate the mask to make sure we are semantics | |||
| 48800 | // preserving. | |||
| 48801 | // The transform is not safe if the result of C1 << C2 exceeds the bitwidth | |||
| 48802 | // of the underlying setcc_c operation if the setcc_c was zero extended. | |||
| 48803 | // Consider the following example: | |||
| 48804 | // zext(setcc_c) -> i32 0x0000FFFF | |||
| 48805 | // c1 -> i32 0x0000FFFF | |||
| 48806 | // c2 -> i32 0x00000001 | |||
| 48807 | // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE | |||
| 48808 | // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE | |||
| 48809 | if (N00.getOpcode() == X86ISD::SETCC_CARRY) { | |||
| 48810 | MaskOK = true; | |||
| 48811 | } else if (N00.getOpcode() == ISD::SIGN_EXTEND && | |||
| 48812 | N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { | |||
| 48813 | MaskOK = true; | |||
| 48814 | } else if ((N00.getOpcode() == ISD::ZERO_EXTEND || | |||
| 48815 | N00.getOpcode() == ISD::ANY_EXTEND) && | |||
| 48816 | N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) { | |||
| 48817 | MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits()); | |||
| 48818 | } | |||
| 48819 | if (MaskOK && Mask != 0) { | |||
| 48820 | SDLoc DL(N); | |||
| 48821 | return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT)); | |||
| 48822 | } | |||
| 48823 | } | |||
| 48824 | ||||
| 48825 | return SDValue(); | |||
| 48826 | } | |||
| 48827 | ||||
| 48828 | static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG, | |||
| 48829 | const X86Subtarget &Subtarget) { | |||
| 48830 | SDValue N0 = N->getOperand(0); | |||
| 48831 | SDValue N1 = N->getOperand(1); | |||
| 48832 | EVT VT = N0.getValueType(); | |||
| 48833 | unsigned Size = VT.getSizeInBits(); | |||
| 48834 | ||||
| 48835 | if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget)) | |||
| 48836 | return V; | |||
| 48837 | ||||
| 48838 | // fold (ashr (shl a, [56,48,32,24,16]), SarConst) | |||
| 48839 | // into (shl (sext a), [56,48,32,24,16] - SarConst) or | |||
| 48840 | // into (sra (sext a), SarConst - [56,48,32,24,16]) | |||
| 48841 | // depending on the sign of (SarConst - [56,48,32,24,16]) | |||
| 48842 | ||||
| 48843 | // sexts in X86 are MOVs. The MOVs have the same code size | |||
| 48844 | // as the SHIFTs above (only a SHIFT by 1 has lower code size). | |||
| 48845 | // However, the MOVs have 2 advantages over a SHIFT: | |||
| 48846 | // 1. MOVs can write to a register that differs from the source. | |||
| 48847 | // 2. MOVs accept memory operands. | |||
| 48848 | ||||
| 48849 | if (VT.isVector() || N1.getOpcode() != ISD::Constant || | |||
| 48850 | N0.getOpcode() != ISD::SHL || !N0.hasOneUse() || | |||
| 48851 | N0.getOperand(1).getOpcode() != ISD::Constant) | |||
| 48852 | return SDValue(); | |||
| 48853 | ||||
| 48854 | SDValue N00 = N0.getOperand(0); | |||
| 48855 | SDValue N01 = N0.getOperand(1); | |||
| 48856 | APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue(); | |||
| 48857 | APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue(); | |||
| 48858 | EVT CVT = N1.getValueType(); | |||
| 48859 | ||||
| 48860 | if (SarConst.isNegative()) | |||
| 48861 | return SDValue(); | |||
| 48862 | ||||
| 48863 | for (MVT SVT : { MVT::i8, MVT::i16, MVT::i32 }) { | |||
| 48864 | unsigned ShiftSize = SVT.getSizeInBits(); | |||
| 48865 | // Skip types without a corresponding sext/zext and | |||
| 48866 | // ShlConst values that are not one of [56,48,32,24,16]. | |||
| 48867 | if (ShiftSize >= Size || ShlConst != Size - ShiftSize) | |||
| 48868 | continue; | |||
| 48869 | SDLoc DL(N); | |||
| 48870 | SDValue NN = | |||
| 48871 | DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT)); | |||
| 48872 | SarConst = SarConst - (Size - ShiftSize); | |||
| 48873 | if (SarConst == 0) | |||
| 48874 | return NN; | |||
| 48875 | if (SarConst.isNegative()) | |||
| 48876 | return DAG.getNode(ISD::SHL, DL, VT, NN, | |||
| 48877 | DAG.getConstant(-SarConst, DL, CVT)); | |||
| 48878 | return DAG.getNode(ISD::SRA, DL, VT, NN, | |||
| 48879 | DAG.getConstant(SarConst, DL, CVT)); | |||
| 48880 | } | |||
| 48881 | return SDValue(); | |||
| 48882 | } | |||
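| | ||||
| | // Editor's note: a concrete instance of the fold above for i32 (not part of | |||
| | // the original source); 'ashr_of_shl_ref' is a hypothetical name, assuming | |||
| | // <cstdint>. For (ashr (shl x, 24), 27): ShlConst == 24 selects SVT == i8, | |||
| | // and SarConst - (32 - 8) == 3 >= 0, so we emit (sra (sext_inreg x, i8), 3). | |||
| | static inline int32_t ashr_of_shl_ref(int32_t X) { | |||
| | return (int32_t)(int8_t)X >> 3; // same value as (X << 24) >> 27 | |||
| | } | |||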
| 48883 | ||||
| 48884 | static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG, | |||
| 48885 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 48886 | const X86Subtarget &Subtarget) { | |||
| 48887 | SDValue N0 = N->getOperand(0); | |||
| 48888 | SDValue N1 = N->getOperand(1); | |||
| 48889 | EVT VT = N0.getValueType(); | |||
| 48890 | ||||
| 48891 | if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget)) | |||
| 48892 | return V; | |||
| 48893 | ||||
| 48894 | // Only do this on the last DAG combine as it can interfere with other | |||
| 48895 | // combines. | |||
| 48896 | if (!DCI.isAfterLegalizeDAG()) | |||
| 48897 | return SDValue(); | |||
| 48898 | ||||
| 48899 | // Try to improve a sequence of srl (and X, C1), C2 by inverting the order. | |||
| 48900 | // TODO: This is a generic DAG combine that became an x86-only combine to | |||
| 48901 | // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and | |||
| 48902 | // and-not ('andn'). | |||
| 48903 | if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) | |||
| 48904 | return SDValue(); | |||
| 48905 | ||||
| 48906 | auto *ShiftC = dyn_cast<ConstantSDNode>(N1); | |||
| 48907 | auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1)); | |||
| 48908 | if (!ShiftC || !AndC) | |||
| 48909 | return SDValue(); | |||
| 48910 | ||||
| 48911 | // If we can shrink the constant mask below 8-bits or 32-bits, then this | |||
| 48912 | // transform should reduce code size. It may also enable secondary transforms | |||
| 48913 | // from improved known-bits analysis or instruction selection. | |||
| 48914 | APInt MaskVal = AndC->getAPIntValue(); | |||
| 48915 | ||||
| 48916 | // If this can be matched by a zero extend, don't optimize. | |||
| 48917 | if (MaskVal.isMask()) { | |||
| 48918 | unsigned TO = MaskVal.countr_one(); | |||
| 48919 | if (TO >= 8 && isPowerOf2_32(TO)) | |||
| 48920 | return SDValue(); | |||
| 48921 | } | |||
| 48922 | ||||
| 48923 | APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue()); | |||
| 48924 | unsigned OldMaskSize = MaskVal.getSignificantBits(); | |||
| 48925 | unsigned NewMaskSize = NewMaskVal.getSignificantBits(); | |||
| 48926 | if ((OldMaskSize > 8 && NewMaskSize <= 8) || | |||
| 48927 | (OldMaskSize > 32 && NewMaskSize <= 32)) { | |||
| 48928 | // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC) | |||
| 48929 | SDLoc DL(N); | |||
| 48930 | SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT); | |||
| 48931 | SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1); | |||
| 48932 | return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask); | |||
| 48933 | } | |||
| 48934 | return SDValue(); | |||
| 48935 | } | |||
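| | ||||
| | // Editor's note: a concrete instance of the reordering above (not part of | |||
| | // the original source); 'srl_and_reorder_ref' is a hypothetical name, | |||
| | // assuming <cstdint>. srl (and X, 0x7F000), 12 --> and (srl X, 12), 0x7F: | |||
| | // the shifted mask fits in 8 bits, so the AND gets a shorter encoding. | |||
| | static inline uint32_t srl_and_reorder_ref(uint32_t X) { | |||
| | return (X >> 12) & 0x7F; // equal to ((X & 0x7F000) >> 12) | |||
| | } | |||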
| 48936 | ||||
| 48937 | static SDValue combineHorizOpWithShuffle(SDNode *N, SelectionDAG &DAG, | |||
| 48938 | const X86Subtarget &Subtarget) { | |||
| 48939 | unsigned Opcode = N->getOpcode(); | |||
| 48940 | assert(isHorizOp(Opcode) && "Unexpected hadd/hsub/pack opcode"); | |||
| 48941 | ||||
| 48942 | SDLoc DL(N); | |||
| 48943 | EVT VT = N->getValueType(0); | |||
| 48944 | SDValue N0 = N->getOperand(0); | |||
| 48945 | SDValue N1 = N->getOperand(1); | |||
| 48946 | EVT SrcVT = N0.getValueType(); | |||
| 48947 | ||||
| 48948 | SDValue BC0 = | |||
| 48949 | N->isOnlyUserOf(N0.getNode()) ? peekThroughOneUseBitcasts(N0) : N0; | |||
| 48950 | SDValue BC1 = | |||
| 48951 | N->isOnlyUserOf(N1.getNode()) ? peekThroughOneUseBitcasts(N1) : N1; | |||
| 48952 | ||||
| 48953 | // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X))) | |||
| 48954 | // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for | |||
| 48955 | // truncation trees that help us avoid lane crossing shuffles. | |||
| 48956 | // TODO: There's a lot more we can do for PACK/HADD style shuffle combines. | |||
| 48957 | // TODO: We don't handle vXf64 shuffles yet. | |||
| 48958 | if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) { | |||
| 48959 | if (SDValue BCSrc = getSplitVectorSrc(BC0, BC1, false)) { | |||
| 48960 | SmallVector<SDValue> ShuffleOps; | |||
| 48961 | SmallVector<int> ShuffleMask, ScaledMask; | |||
| 48962 | SDValue Vec = peekThroughBitcasts(BCSrc); | |||
| 48963 | if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) { | |||
| 48964 | resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask); | |||
| 48965 | // To keep the HOP LHS/RHS coherency, we must be able to scale the unary | |||
| 48966 | // shuffle to a v4X64 width - we can probably relax this in the future. | |||
| 48967 | if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 && | |||
| 48968 | ShuffleOps[0].getValueType().is256BitVector() && | |||
| 48969 | scaleShuffleElements(ShuffleMask, 4, ScaledMask)) { | |||
| 48970 | SDValue Lo, Hi; | |||
| 48971 | MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32; | |||
| 48972 | std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL); | |||
| 48973 | Lo = DAG.getBitcast(SrcVT, Lo); | |||
| 48974 | Hi = DAG.getBitcast(SrcVT, Hi); | |||
| 48975 | SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi); | |||
| 48976 | Res = DAG.getBitcast(ShufVT, Res); | |||
| 48977 | Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask); | |||
| 48978 | return DAG.getBitcast(VT, Res); | |||
| 48979 | } | |||
| 48980 | } | |||
| 48981 | } | |||
| 48982 | } | |||
| 48983 | ||||
| 48984 | // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(Z,W)) -> SHUFFLE(HOP()). | |||
| 48985 | if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32) { | |||
| 48986 | // If either/both ops are a shuffle that can scale to v2x64, | |||
| 48987 | // then see if we can perform this as a v4x32 post shuffle. | |||
| 48988 | SmallVector<SDValue> Ops0, Ops1; | |||
| 48989 | SmallVector<int> Mask0, Mask1, ScaledMask0, ScaledMask1; | |||
| 48990 | bool IsShuf0 = | |||
| 48991 | getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) && | |||
| 48992 | scaleShuffleElements(Mask0, 2, ScaledMask0) && | |||
| 48993 | all_of(Ops0, [](SDValue Op) { return Op.getValueSizeInBits() == 128; }); | |||
| 48994 | bool IsShuf1 = | |||
| 48995 | getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) && | |||
| 48996 | scaleShuffleElements(Mask1, 2, ScaledMask1) && | |||
| 48997 | all_of(Ops1, [](SDValue Op) { return Op.getValueSizeInBits() == 128; }); | |||
| 48998 | if (IsShuf0 || IsShuf1) { | |||
| 48999 | if (!IsShuf0) { | |||
| 49000 | Ops0.assign({BC0}); | |||
| 49001 | ScaledMask0.assign({0, 1}); | |||
| 49002 | } | |||
| 49003 | if (!IsShuf1) { | |||
| 49004 | Ops1.assign({BC1}); | |||
| 49005 | ScaledMask1.assign({0, 1}); | |||
| 49006 | } | |||
| 49007 | ||||
| 49008 | SDValue LHS, RHS; | |||
| 49009 | int PostShuffle[4] = {-1, -1, -1, -1}; | |||
| 49010 | auto FindShuffleOpAndIdx = [&](int M, int &Idx, ArrayRef<SDValue> Ops) { | |||
| 49011 | if (M < 0) | |||
| 49012 | return true; | |||
| 49013 | Idx = M % 2; | |||
| 49014 | SDValue Src = Ops[M / 2]; | |||
| 49015 | if (!LHS || LHS == Src) { | |||
| 49016 | LHS = Src; | |||
| 49017 | return true; | |||
| 49018 | } | |||
| 49019 | if (!RHS || RHS == Src) { | |||
| 49020 | Idx += 2; | |||
| 49021 | RHS = Src; | |||
| 49022 | return true; | |||
| 49023 | } | |||
| 49024 | return false; | |||
| 49025 | }; | |||
| 49026 | if (FindShuffleOpAndIdx(ScaledMask0[0], PostShuffle[0], Ops0) && | |||
| 49027 | FindShuffleOpAndIdx(ScaledMask0[1], PostShuffle[1], Ops0) && | |||
| 49028 | FindShuffleOpAndIdx(ScaledMask1[0], PostShuffle[2], Ops1) && | |||
| 49029 | FindShuffleOpAndIdx(ScaledMask1[1], PostShuffle[3], Ops1)) { | |||
| 49030 | LHS = DAG.getBitcast(SrcVT, LHS); | |||
| 49031 | RHS = DAG.getBitcast(SrcVT, RHS ? RHS : LHS); | |||
| 49032 | MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32; | |||
| 49033 | SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS); | |||
| 49034 | Res = DAG.getBitcast(ShufVT, Res); | |||
| 49035 | Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, PostShuffle); | |||
| 49036 | return DAG.getBitcast(VT, Res); | |||
| 49037 | } | |||
| 49038 | } | |||
| 49039 | } | |||
| 49040 | ||||
| 49041 | // Attempt to fold HOP(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(HOP(X,Y)). | |||
| 49042 | if (VT.is256BitVector() && Subtarget.hasInt256()) { | |||
| 49043 | SmallVector<int> Mask0, Mask1; | |||
| 49044 | SmallVector<SDValue> Ops0, Ops1; | |||
| 49045 | SmallVector<int, 2> ScaledMask0, ScaledMask1; | |||
| 49046 | if (getTargetShuffleInputs(BC0, Ops0, Mask0, DAG) && !isAnyZero(Mask0) && | |||
| 49047 | getTargetShuffleInputs(BC1, Ops1, Mask1, DAG) && !isAnyZero(Mask1) && | |||
| 49048 | !Ops0.empty() && !Ops1.empty() && | |||
| 49049 | all_of(Ops0, | |||
| 49050 | [](SDValue Op) { return Op.getValueType().is256BitVector(); }) && | |||
| 49051 | all_of(Ops1, | |||
| 49052 | [](SDValue Op) { return Op.getValueType().is256BitVector(); }) && | |||
| 49053 | scaleShuffleElements(Mask0, 2, ScaledMask0) && | |||
| 49054 | scaleShuffleElements(Mask1, 2, ScaledMask1)) { | |||
| 49055 | SDValue Op00 = peekThroughBitcasts(Ops0.front()); | |||
| 49056 | SDValue Op10 = peekThroughBitcasts(Ops1.front()); | |||
| 49057 | SDValue Op01 = peekThroughBitcasts(Ops0.back()); | |||
| 49058 | SDValue Op11 = peekThroughBitcasts(Ops1.back()); | |||
| 49059 | if ((Op00 == Op11) && (Op01 == Op10)) { | |||
| 49060 | std::swap(Op10, Op11); | |||
| 49061 | ShuffleVectorSDNode::commuteMask(ScaledMask1); | |||
| 49062 | } | |||
| 49063 | if ((Op00 == Op10) && (Op01 == Op11)) { | |||
| 49064 | const int Map[4] = {0, 2, 1, 3}; | |||
| 49065 | SmallVector<int, 4> ShuffleMask( | |||
| 49066 | {Map[ScaledMask0[0]], Map[ScaledMask1[0]], Map[ScaledMask0[1]], | |||
| 49067 | Map[ScaledMask1[1]]}); | |||
| 49068 | MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64; | |||
| 49069 | SDValue Res = DAG.getNode(Opcode, DL, VT, DAG.getBitcast(SrcVT, Op00), | |||
| 49070 | DAG.getBitcast(SrcVT, Op01)); | |||
| 49071 | Res = DAG.getBitcast(ShufVT, Res); | |||
| 49072 | Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask); | |||
| 49073 | return DAG.getBitcast(VT, Res); | |||
| 49074 | } | |||
| 49075 | } | |||
| 49076 | } | |||
| 49077 | ||||
| 49078 | return SDValue(); | |||
| 49079 | } | |||
| 49080 | ||||
| 49081 | static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, | |||
| 49082 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 49083 | const X86Subtarget &Subtarget) { | |||
| 49084 | unsigned Opcode = N->getOpcode(); | |||
| 49085 | assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) && | |||
| 49086 | "Unexpected pack opcode"); | |||
| 49087 | ||||
| 49088 | EVT VT = N->getValueType(0); | |||
| 49089 | SDValue N0 = N->getOperand(0); | |||
| 49090 | SDValue N1 = N->getOperand(1); | |||
| 49091 | unsigned NumDstElts = VT.getVectorNumElements(); | |||
| 49092 | unsigned DstBitsPerElt = VT.getScalarSizeInBits(); | |||
| 49093 | unsigned SrcBitsPerElt = 2 * DstBitsPerElt; | |||
| 49094 | assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt && | |||
| 49095 | N1.getScalarValueSizeInBits() == SrcBitsPerElt && | |||
| 49096 | "Unexpected PACKSS/PACKUS input type"); | |||
| 49097 | ||||
| 49098 | bool IsSigned = (X86ISD::PACKSS == Opcode); | |||
| 49099 | ||||
| 49100 | // Constant Folding. | |||
| 49101 | APInt UndefElts0, UndefElts1; | |||
| 49102 | SmallVector<APInt, 32> EltBits0, EltBits1; | |||
| 49103 | if ((N0.isUndef() || N->isOnlyUserOf(N0.getNode())) && | |||
| 49104 | (N1.isUndef() || N->isOnlyUserOf(N1.getNode())) && | |||
| 49105 | getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) && | |||
| 49106 | getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) { | |||
| 49107 | unsigned NumLanes = VT.getSizeInBits() / 128; | |||
| 49108 | unsigned NumSrcElts = NumDstElts / 2; | |||
| 49109 | unsigned NumDstEltsPerLane = NumDstElts / NumLanes; | |||
| 49110 | unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; | |||
| 49111 | ||||
| 49112 | APInt Undefs(NumDstElts, 0); | |||
| 49113 | SmallVector<APInt, 32> Bits(NumDstElts, APInt::getZero(DstBitsPerElt)); | |||
| 49114 | for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { | |||
| 49115 | for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) { | |||
| 49116 | unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane; | |||
| 49117 | auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0); | |||
| 49118 | auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0); | |||
| 49119 | ||||
| 49120 | if (UndefElts[SrcIdx]) { | |||
| 49121 | Undefs.setBit(Lane * NumDstEltsPerLane + Elt); | |||
| 49122 | continue; | |||
| 49123 | } | |||
| 49124 | ||||
| 49125 | APInt &Val = EltBits[SrcIdx]; | |||
| 49126 | if (IsSigned) { | |||
| 49127 | // PACKSS: Truncate signed value with signed saturation. | |||
| 49128 | // Source values less than dst minint are saturated to minint. | |||
| 49129 | // Source values greater than dst maxint are saturated to maxint. | |||
| 49130 | if (Val.isSignedIntN(DstBitsPerElt)) | |||
| 49131 | Val = Val.trunc(DstBitsPerElt); | |||
| 49132 | else if (Val.isNegative()) | |||
| 49133 | Val = APInt::getSignedMinValue(DstBitsPerElt); | |||
| 49134 | else | |||
| 49135 | Val = APInt::getSignedMaxValue(DstBitsPerElt); | |||
| 49136 | } else { | |||
| 49137 | // PACKUS: Truncate signed value with unsigned saturation. | |||
| 49138 | // Source values less than zero are saturated to zero. | |||
| 49139 | // Source values greater than dst maxuint are saturated to maxuint. | |||
| 49140 | if (Val.isIntN(DstBitsPerElt)) | |||
| 49141 | Val = Val.trunc(DstBitsPerElt); | |||
| 49142 | else if (Val.isNegative()) | |||
| 49143 | Val = APInt::getZero(DstBitsPerElt); | |||
| 49144 | else | |||
| 49145 | Val = APInt::getAllOnes(DstBitsPerElt); | |||
| 49146 | } | |||
| 49147 | Bits[Lane * NumDstEltsPerLane + Elt] = Val; | |||
| 49148 | } | |||
| 49149 | } | |||
| 49150 | ||||
| 49151 | return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N)); | |||
| 49152 | } | |||
| 49153 | ||||
| 49154 | // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()). | |||
| 49155 | if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget)) | |||
| 49156 | return V; | |||
| 49157 | ||||
| 49158 | // Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular | |||
| 49159 | // truncate to create a larger truncate. | |||
| 49160 | if (Subtarget.hasAVX512() && | |||
| 49161 | N0.getOpcode() == ISD::TRUNCATE && N1.isUndef() && VT == MVT::v16i8 && | |||
| 49162 | N0.getOperand(0).getValueType() == MVT::v8i32) { | |||
| 49163 | if ((IsSigned && DAG.ComputeNumSignBits(N0) > 8) || | |||
| 49164 | (!IsSigned && | |||
| 49165 | DAG.MaskedValueIsZero(N0, APInt::getHighBitsSet(16, 8)))) { | |||
| 49166 | if (Subtarget.hasVLX()) | |||
| 49167 | return DAG.getNode(X86ISD::VTRUNC, SDLoc(N), VT, N0.getOperand(0)); | |||
| 49168 | ||||
| 49169 | // Widen input to v16i32 so we can truncate that. | |||
| 49170 | SDLoc dl(N); | |||
| 49171 | SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v16i32, | |||
| 49172 | N0.getOperand(0), DAG.getUNDEF(MVT::v8i32)); | |||
| 49173 | return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Concat); | |||
| 49174 | } | |||
| 49175 | } | |||
| 49176 | ||||
| 49177 | // Try to fold PACK(EXTEND(X),EXTEND(Y)) -> CONCAT(X,Y) subvectors. | |||
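| | // e.g. PACKSSWB(SEXT(v8i8 X -> v8i16), SEXT(v8i8 Y -> v8i16)): signed | |||
| | // saturation of a sign-extended i8 gives back the original i8, so the | |||
| | // result is simply CONCAT(X, Y) -> v16i8. | |||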
| 49178 | if (VT.is128BitVector()) { | |||
| 49179 | unsigned ExtOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; | |||
| 49180 | SDValue Src0, Src1; | |||
| 49181 | if (N0.getOpcode() == ExtOpc && | |||
| 49182 | N0.getOperand(0).getValueType().is64BitVector() && | |||
| 49183 | N0.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) { | |||
| 49184 | Src0 = N0.getOperand(0); | |||
| 49185 | } | |||
| 49186 | if (N1.getOpcode() == ExtOpc && | |||
| 49187 | N1.getOperand(0).getValueType().is64BitVector() && | |||
| 49188 | N1.getOperand(0).getScalarValueSizeInBits() == DstBitsPerElt) { | |||
| 49189 | Src1 = N1.getOperand(0); | |||
| 49190 | } | |||
| 49191 | if ((Src0 || N0.isUndef()) && (Src1 || N1.isUndef())) { | |||
| 49192 | assert((Src0 || Src1) && "Found PACK(UNDEF,UNDEF)"); | |||
| 49193 | Src0 = Src0 ? Src0 : DAG.getUNDEF(Src1.getValueType()); | |||
| 49194 | Src1 = Src1 ? Src1 : DAG.getUNDEF(Src0.getValueType()); | |||
| 49195 | return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Src0, Src1); | |||
| 49196 | } | |||
| 49197 | ||||
| 49198 | // Try again with pack(*_extend_vector_inreg, undef). | |||
| 49199 | unsigned VecInRegOpc = IsSigned ? ISD::SIGN_EXTEND_VECTOR_INREG | |||
| 49200 | : ISD::ZERO_EXTEND_VECTOR_INREG; | |||
| 49201 | if (N0.getOpcode() == VecInRegOpc && N1.isUndef() && | |||
| 49202 | N0.getOperand(0).getScalarValueSizeInBits() < DstBitsPerElt) | |||
| 49203 | return getEXTEND_VECTOR_INREG(ExtOpc, SDLoc(N), VT, N0.getOperand(0), | |||
| 49204 | DAG); | |||
| 49205 | } | |||
| 49206 | ||||
| 49207 | // Attempt to combine as shuffle. | |||
| 49208 | SDValue Op(N, 0); | |||
| 49209 | if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) | |||
| 49210 | return Res; | |||
| 49211 | ||||
| 49212 | return SDValue(); | |||
| 49213 | } | |||
| 49214 | ||||
| 49215 | static SDValue combineVectorHADDSUB(SDNode *N, SelectionDAG &DAG, | |||
| 49216 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 49217 | const X86Subtarget &Subtarget) { | |||
| 49218 | assert((X86ISD::HADD == N->getOpcode() || X86ISD::FHADD == N->getOpcode() || | |||
| 49219 |         X86ISD::HSUB == N->getOpcode() || X86ISD::FHSUB == N->getOpcode()) && | |||
| 49220 |        "Unexpected horizontal add/sub opcode"); | |||
| 49221 | ||||
| 49222 | if (!shouldUseHorizontalOp(true, DAG, Subtarget)) { | |||
| 49223 | MVT VT = N->getSimpleValueType(0); | |||
| 49224 | SDValue LHS = N->getOperand(0); | |||
| 49225 | SDValue RHS = N->getOperand(1); | |||
| 49226 | ||||
| 49227 | // HOP(HOP'(X,X),HOP'(Y,Y)) -> HOP(PERMUTE(HOP'(X,Y)),PERMUTE(HOP'(X,Y))). | |||
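| | // e.g. HADD(HADD(X,X), HADD(Y,Y)): both halves of each lane of HADD(X,X) | |||
| | // hold the same sums, so a single HADD(X,Y) plus two PSHUFDs broadcasting | |||
| | // its low and high halves computes the same result with one fewer | |||
| | // horizontal op, which helps when horizontal ops are slow. | |||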
| 49228 | if (LHS != RHS && LHS.getOpcode() == N->getOpcode() && | |||
| 49229 | LHS.getOpcode() == RHS.getOpcode() && | |||
| 49230 | LHS.getValueType() == RHS.getValueType() && | |||
| 49231 | N->isOnlyUserOf(LHS.getNode()) && N->isOnlyUserOf(RHS.getNode())) { | |||
| 49232 | SDValue LHS0 = LHS.getOperand(0); | |||
| 49233 | SDValue LHS1 = LHS.getOperand(1); | |||
| 49234 | SDValue RHS0 = RHS.getOperand(0); | |||
| 49235 | SDValue RHS1 = RHS.getOperand(1); | |||
| 49236 | if ((LHS0 == LHS1 || LHS0.isUndef() || LHS1.isUndef()) && | |||
| 49237 | (RHS0 == RHS1 || RHS0.isUndef() || RHS1.isUndef())) { | |||
| 49238 | SDLoc DL(N); | |||
| 49239 | SDValue Res = DAG.getNode(LHS.getOpcode(), DL, LHS.getValueType(), | |||
| 49240 | LHS0.isUndef() ? LHS1 : LHS0, | |||
| 49241 | RHS0.isUndef() ? RHS1 : RHS0); | |||
| 49242 | MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32); | |||
| 49243 | Res = DAG.getBitcast(ShufVT, Res); | |||
| 49244 | SDValue NewLHS = | |||
| 49245 | DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res, | |||
| 49246 | getV4X86ShuffleImm8ForMask({0, 1, 0, 1}, DL, DAG)); | |||
| 49247 | SDValue NewRHS = | |||
| 49248 | DAG.getNode(X86ISD::PSHUFD, DL, ShufVT, Res, | |||
| 49249 | getV4X86ShuffleImm8ForMask({2, 3, 2, 3}, DL, DAG)); | |||
| 49250 | return DAG.getNode(N->getOpcode(), DL, VT, DAG.getBitcast(VT, NewLHS), | |||
| 49251 | DAG.getBitcast(VT, NewRHS)); | |||
| 49252 | } | |||
| 49253 | } | |||
| 49254 | } | |||
| 49255 | ||||
| 49256 | // Try to fold HOP(SHUFFLE(),SHUFFLE()) -> SHUFFLE(HOP()). | |||
| 49257 | if (SDValue V = combineHorizOpWithShuffle(N, DAG, Subtarget)) | |||
| 49258 | return V; | |||
| 49259 | ||||
| 49260 | return SDValue(); | |||
| 49261 | } | |||
| 49262 | ||||
| 49263 | static SDValue combineVectorShiftVar(SDNode *N, SelectionDAG &DAG, | |||
| 49264 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 49265 | const X86Subtarget &Subtarget) { | |||
| 49266 | assert((X86ISD::VSHL == N->getOpcode() || X86ISD::VSRA == N->getOpcode() || | |||
| 49267 |         X86ISD::VSRL == N->getOpcode()) && | |||
| 49268 |        "Unexpected shift opcode"); | |||
| 49269 | EVT VT = N->getValueType(0); | |||
| 49270 | SDValue N0 = N->getOperand(0); | |||
| 49271 | SDValue N1 = N->getOperand(1); | |||
| 49272 | ||||
| 49273 | // Shift zero -> zero. | |||
| 49274 | if (ISD::isBuildVectorAllZeros(N0.getNode())) | |||
| 49275 | return DAG.getConstant(0, SDLoc(N), VT); | |||
| 49276 | ||||
| 49277 | // Detect constant shift amounts. | |||
| 49278 | APInt UndefElts; | |||
| 49279 | SmallVector<APInt, 32> EltBits; | |||
| 49280 | if (getTargetConstantBitsFromNode(N1, 64, UndefElts, EltBits, true, false)) { | |||
| 49281 | unsigned X86Opc = getTargetVShiftUniformOpcode(N->getOpcode(), false); | |||
| 49282 | return getTargetVShiftByConstNode(X86Opc, SDLoc(N), VT.getSimpleVT(), N0, | |||
| 49283 | EltBits[0].getZExtValue(), DAG); | |||
| 49284 | } | |||
| 49285 | ||||
| 49286 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 49287 | APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); | |||
| 49288 | if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI)) | |||
| 49289 | return SDValue(N, 0); | |||
| 49290 | ||||
| 49291 | return SDValue(); | |||
| 49292 | } | |||
| 49293 | ||||
| 49294 | static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, | |||
| 49295 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 49296 | const X86Subtarget &Subtarget) { | |||
| 49297 | unsigned Opcode = N->getOpcode(); | |||
| 49298 | assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode || | |||
| 49299 |         X86ISD::VSRLI == Opcode) && | |||
| 49300 |        "Unexpected shift opcode"); | |||
| 49301 | bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode; | |||
| 49302 | EVT VT = N->getValueType(0); | |||
| 49303 | SDValue N0 = N->getOperand(0); | |||
| 49304 | SDValue N1 = N->getOperand(1); | |||
| 49305 | unsigned NumBitsPerElt = VT.getScalarSizeInBits(); | |||
| 49306 | assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 && | |||
| 49307 |        "Unexpected value type"); | |||
| 49308 | assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type"); | |||
| 49309 | ||||
| 49310 | // (shift undef, X) -> 0 | |||
| 49311 | if (N0.isUndef()) | |||
| 49312 | return DAG.getConstant(0, SDLoc(N), VT); | |||
| 49313 | ||||
| 49314 | // Out of range logical bit shifts are guaranteed to be zero. | |||
| 49315 | // Out of range arithmetic bit shifts splat the sign bit. | |||
| 49316 | unsigned ShiftVal = N->getConstantOperandVal(1); | |||
| 49317 | if (ShiftVal >= NumBitsPerElt) { | |||
| 49318 | if (LogicalShift) | |||
| 49319 | return DAG.getConstant(0, SDLoc(N), VT); | |||
| 49320 | ShiftVal = NumBitsPerElt - 1; | |||
| 49321 | } | |||
| 49322 | ||||
| 49323 | // (shift X, 0) -> X | |||
| 49324 | if (!ShiftVal) | |||
| 49325 | return N0; | |||
| 49326 | ||||
| 49327 | // (shift 0, C) -> 0 | |||
| 49328 | if (ISD::isBuildVectorAllZeros(N0.getNode())) | |||
| 49329 | // N0 is all zeros or undef. We guarantee that the bits shifted into the | |||
| 49330 | // result are all zeros, not undef. | |||
| 49331 | return DAG.getConstant(0, SDLoc(N), VT); | |||
| 49332 | ||||
| 49333 | // (VSRAI -1, C) -> -1 | |||
| 49334 | if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode())) | |||
| 49335 | // N0 is all ones or undef. We guarantee that the bits shifted into the | |||
| 49336 | // result are all ones, not undef. | |||
| 49337 | return DAG.getConstant(-1, SDLoc(N), VT); | |||
| 49338 | ||||
| 49339 | auto MergeShifts = [&](SDValue X, uint64_t Amt0, uint64_t Amt1) { | |||
| 49340 | unsigned NewShiftVal = Amt0 + Amt1; | |||
| 49341 | if (NewShiftVal >= NumBitsPerElt) { | |||
| 49342 | // Out of range logical bit shifts are guaranteed to be zero. | |||
| 49343 | // Out of range arithmetic bit shifts splat the sign bit. | |||
| 49344 | if (LogicalShift) | |||
| 49345 | return DAG.getConstant(0, SDLoc(N), VT); | |||
| 49346 | NewShiftVal = NumBitsPerElt - 1; | |||
| 49347 | } | |||
| 49348 | return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0), | |||
| 49349 | DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8)); | |||
| 49350 | }; | |||
| 49351 | ||||
| 49352 | // (shift (shift X, C2), C1) -> (shift X, (C1 + C2)) | |||
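| | // e.g. VSRLI(VSRLI(X, 3), 2) -> VSRLI(X, 5). If the summed amount reaches | |||
| | // NumBitsPerElt, MergeShifts instead folds to zero (logical) or to a splat | |||
| | // of the sign bit (arithmetic). | |||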
| 49353 | if (Opcode == N0.getOpcode()) | |||
| 49354 | return MergeShifts(N0.getOperand(0), ShiftVal, N0.getConstantOperandVal(1)); | |||
| 49355 | ||||
| 49356 | // (shl (add X, X), C) -> (shl X, (C + 1)) | |||
| 49357 | if (Opcode == X86ISD::VSHLI && N0.getOpcode() == ISD::ADD && | |||
| 49358 | N0.getOperand(0) == N0.getOperand(1)) | |||
| 49359 | return MergeShifts(N0.getOperand(0), ShiftVal, 1); | |||
| 49360 | ||||
| 49361 | // We can decode 'whole byte' logical bit shifts as shuffles. | |||
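| | // e.g. a v2i64 VSRLI by 16 moves whole bytes down within each element with | |||
| | // zero fill, so it is expressible as a byte shuffle that may combine with | |||
| | // surrounding shuffles. | |||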
| 49362 | if (LogicalShift && (ShiftVal % 8) == 0) { | |||
| 49363 | SDValue Op(N, 0); | |||
| 49364 | if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) | |||
| 49365 | return Res; | |||
| 49366 | } | |||
| 49367 | ||||
| 49368 | auto TryConstantFold = [&](SDValue V) { | |||
| 49369 | APInt UndefElts; | |||
| 49370 | SmallVector<APInt, 32> EltBits; | |||
| 49371 | if (!getTargetConstantBitsFromNode(V, NumBitsPerElt, UndefElts, EltBits)) | |||
| 49372 | return SDValue(); | |||
| 49373 | assert(EltBits.size() == VT.getVectorNumElements() && | |||
| 49374 |        "Unexpected shift value type"); | |||
| 49375 | // Undef elements need to fold to 0. It's possible SimplifyDemandedBits | |||
| 49376 | // created an undef input due to no input bits being demanded, but the user | |||
| 49377 | // still expects 0 in the other bits. | |||
| 49378 | for (unsigned i = 0, e = EltBits.size(); i != e; ++i) { | |||
| 49379 | APInt &Elt = EltBits[i]; | |||
| 49380 | if (UndefElts[i]) | |||
| 49381 | Elt = 0; | |||
| 49382 | else if (X86ISD::VSHLI == Opcode) | |||
| 49383 | Elt <<= ShiftVal; | |||
| 49384 | else if (X86ISD::VSRAI == Opcode) | |||
| 49385 | Elt.ashrInPlace(ShiftVal); | |||
| 49386 | else | |||
| 49387 | Elt.lshrInPlace(ShiftVal); | |||
| 49388 | } | |||
| 49389 | // Reset undef elements since they were zeroed above. | |||
| 49390 | UndefElts = 0; | |||
| 49391 | return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N)); | |||
| 49392 | }; | |||
| 49393 | ||||
| 49394 | // Constant Folding. | |||
| 49395 | if (N->isOnlyUserOf(N0.getNode())) { | |||
| 49396 | if (SDValue C = TryConstantFold(N0)) | |||
| 49397 | return C; | |||
| 49398 | ||||
| 49399 | // Fold (shift (logic X, C2), C1) -> (logic (shift X, C1), (shift C2, C1)) | |||
| 49400 | // Don't break NOT patterns. | |||
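| | // e.g. VSRLI(AND(X, splat(0xF0)), 4) -> AND(VSRLI(X, 4), splat(0x0F)): | |||
| | // the constant half of the logic op is shifted at compile time. | |||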
| 49401 | SDValue BC = peekThroughOneUseBitcasts(N0); | |||
| 49402 | if (ISD::isBitwiseLogicOp(BC.getOpcode()) && | |||
| 49403 | BC->isOnlyUserOf(BC.getOperand(1).getNode()) && | |||
| 49404 | !ISD::isBuildVectorAllOnes(BC.getOperand(1).getNode())) { | |||
| 49405 | if (SDValue RHS = TryConstantFold(BC.getOperand(1))) { | |||
| 49406 | SDLoc DL(N); | |||
| 49407 | SDValue LHS = DAG.getNode(Opcode, DL, VT, | |||
| 49408 | DAG.getBitcast(VT, BC.getOperand(0)), N1); | |||
| 49409 | return DAG.getNode(BC.getOpcode(), DL, VT, LHS, RHS); | |||
| 49410 | } | |||
| 49411 | } | |||
| 49412 | } | |||
| 49413 | ||||
| 49414 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 49415 | if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBitsPerElt), | |||
| 49416 | DCI)) | |||
| 49417 | return SDValue(N, 0); | |||
| 49418 | ||||
| 49419 | return SDValue(); | |||
| 49420 | } | |||
| 49421 | ||||
| 49422 | static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, | |||
| 49423 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 49424 | const X86Subtarget &Subtarget) { | |||
| 49425 | EVT VT = N->getValueType(0); | |||
| 49426 | unsigned Opcode = N->getOpcode(); | |||
| 49427 | assert(((Opcode == X86ISD::PINSRB && VT == MVT::v16i8) || | |||
| 49428 |         (Opcode == X86ISD::PINSRW && VT == MVT::v8i16) || | |||
| 49429 |         Opcode == ISD::INSERT_VECTOR_ELT) && | |||
| 49430 |        "Unexpected vector insertion"); | |||
| 49431 | ||||
| 49432 | SDValue Vec = N->getOperand(0); | |||
| 49433 | SDValue Scl = N->getOperand(1); | |||
| 49434 | SDValue Idx = N->getOperand(2); | |||
| 49435 | ||||
| 49436 | // Fold insert_vector_elt(undef, elt, 0) --> scalar_to_vector(elt). | |||
| 49437 | if (Opcode == ISD::INSERT_VECTOR_ELT && Vec.isUndef() && isNullConstant(Idx)) | |||
| 49438 | return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), VT, Scl); | |||
| 49439 | ||||
| 49440 | if (Opcode == X86ISD::PINSRB || Opcode == X86ISD::PINSRW) { | |||
| 49441 | unsigned NumBitsPerElt = VT.getScalarSizeInBits(); | |||
| 49442 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 49443 | if (TLI.SimplifyDemandedBits(SDValue(N, 0), | |||
| 49444 | APInt::getAllOnes(NumBitsPerElt), DCI)) | |||
| 49445 | return SDValue(N, 0); | |||
| 49446 | } | |||
| 49447 | ||||
| 49448 | // Attempt to combine insertion patterns to a shuffle. | |||
| 49449 | if (VT.isSimple() && DCI.isAfterLegalizeDAG()) { | |||
| 49450 | SDValue Op(N, 0); | |||
| 49451 | if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) | |||
| 49452 | return Res; | |||
| 49453 | } | |||
| 49454 | ||||
| 49455 | return SDValue(); | |||
| 49456 | } | |||
| 49457 | ||||
| 49458 | /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs | |||
| 49459 | /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for | |||
| 49460 | /// OR -> CMPNEQSS. | |||
| 49461 | static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, | |||
| 49462 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 49463 | const X86Subtarget &Subtarget) { | |||
| 49464 | unsigned opcode; | |||
| 49465 | ||||
| 49466 | // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but | |||
| 49467 | // we're requiring SSE2 for both. | |||
| 49468 | if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) { | |||
| 49469 | SDValue N0 = N->getOperand(0); | |||
| 49470 | SDValue N1 = N->getOperand(1); | |||
| 49471 | SDValue CMP0 = N0.getOperand(1); | |||
| 49472 | SDValue CMP1 = N1.getOperand(1); | |||
| 49473 | SDLoc DL(N); | |||
| 49474 | ||||
| 49475 | // The SETCCs should both refer to the same CMP. | |||
| 49476 | if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1) | |||
| 49477 | return SDValue(); | |||
| 49478 | ||||
| 49479 | SDValue CMP00 = CMP0->getOperand(0); | |||
| 49480 | SDValue CMP01 = CMP0->getOperand(1); | |||
| 49481 | EVT VT = CMP00.getValueType(); | |||
| 49482 | ||||
| 49483 | if (VT == MVT::f32 || VT == MVT::f64 || | |||
| 49484 | (VT == MVT::f16 && Subtarget.hasFP16())) { | |||
| 49485 | bool ExpectingFlags = false; | |||
| 49486 | // Check for any users that want flags: | |||
| 49487 | for (const SDNode *U : N->uses()) { | |||
| 49488 | if (ExpectingFlags) | |||
| 49489 | break; | |||
| 49490 | ||||
| 49491 | switch (U->getOpcode()) { | |||
| 49492 | default: | |||
| 49493 | case ISD::BR_CC: | |||
| 49494 | case ISD::BRCOND: | |||
| 49495 | case ISD::SELECT: | |||
| 49496 | ExpectingFlags = true; | |||
| 49497 | break; | |||
| 49498 | case ISD::CopyToReg: | |||
| 49499 | case ISD::SIGN_EXTEND: | |||
| 49500 | case ISD::ZERO_EXTEND: | |||
| 49501 | case ISD::ANY_EXTEND: | |||
| 49502 | break; | |||
| 49503 | } | |||
| 49504 | } | |||
| 49505 | ||||
| 49506 | if (!ExpectingFlags) { | |||
| 49507 | enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0); | |||
| 49508 | enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0); | |||
| 49509 | ||||
| 49510 | if (cc1 == X86::COND_E || cc1 == X86::COND_NE) { | |||
| 49511 | X86::CondCode tmp = cc0; | |||
| 49512 | cc0 = cc1; | |||
| 49513 | cc1 = tmp; | |||
| 49514 | } | |||
| 49515 | ||||
| 49516 | if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) || | |||
| 49517 | (cc0 == X86::COND_NE && cc1 == X86::COND_P)) { | |||
| 49518 | // FIXME: need symbolic constants for these magic numbers. | |||
| 49519 | // See X86ATTInstPrinter.cpp:printSSECC(). | |||
| 49520 | unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4; | |||
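| | // In the SSE CMPSS/CMPSD immediate encoding, 0 selects EQ and 4 selects | |||
| | // NEQ, so this emits CMPEQ for the (E && NP) form and CMPNEQ for the | |||
| | // (NE || P) form. | |||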
| 49521 | if (Subtarget.hasAVX512()) { | |||
| 49522 | SDValue FSetCC = | |||
| 49523 | DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01, | |||
| 49524 | DAG.getTargetConstant(x86cc, DL, MVT::i8)); | |||
| 49525 | // Need to fill with zeros to ensure the bitcast will produce zeroes | |||
| 49526 | // for the upper bits. An EXTRACT_ELEMENT here wouldn't guarantee that. | |||
| 49527 | SDValue Ins = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v16i1, | |||
| 49528 | DAG.getConstant(0, DL, MVT::v16i1), | |||
| 49529 | FSetCC, DAG.getIntPtrConstant(0, DL)); | |||
| 49530 | return DAG.getZExtOrTrunc(DAG.getBitcast(MVT::i16, Ins), DL, | |||
| 49531 | N->getSimpleValueType(0)); | |||
| 49532 | } | |||
| 49533 | SDValue OnesOrZeroesF = | |||
| 49534 | DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00, | |||
| 49535 | CMP01, DAG.getTargetConstant(x86cc, DL, MVT::i8)); | |||
| 49536 | ||||
| 49537 | bool is64BitFP = (CMP00.getValueType() == MVT::f64); | |||
| 49538 | MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32; | |||
| 49539 | ||||
| 49540 | if (is64BitFP && !Subtarget.is64Bit()) { | |||
| 49541 | // On a 32-bit target, we cannot bitcast the 64-bit float to a | |||
| 49542 | // 64-bit integer, since that's not a legal type. Since | |||
| 49543 | // OnesOrZeroesF is all ones or all zeroes, we don't need all the | |||
| 49544 | // bits, but can do this little dance to extract the lowest 32 bits | |||
| 49545 | // and work with those going forward. | |||
| 49546 | SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, | |||
| 49547 | OnesOrZeroesF); | |||
| 49548 | SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64); | |||
| 49549 | OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, | |||
| 49550 | Vector32, DAG.getIntPtrConstant(0, DL)); | |||
| 49551 | IntVT = MVT::i32; | |||
| 49552 | } | |||
| 49553 | ||||
| 49554 | SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF); | |||
| 49555 | SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI, | |||
| 49556 | DAG.getConstant(1, DL, IntVT)); | |||
| 49557 | SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, | |||
| 49558 | ANDed); | |||
| 49559 | return OneBitOfTruth; | |||
| 49560 | } | |||
| 49561 | } | |||
| 49562 | } | |||
| 49563 | } | |||
| 49564 | return SDValue(); | |||
| 49565 | } | |||
| 49566 | ||||
| 49567 | /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y). | |||
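| | /// e.g. (and (xor %x, <all-ones>), %y) becomes a single PANDN/VPANDN, | |||
| | /// avoiding the materialization of an all-ones constant. | |||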
| 49568 | static SDValue combineAndNotIntoANDNP(SDNode *N, SelectionDAG &DAG) { | |||
| 49569 | assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP"); | |||
| 49570 | ||||
| 49571 | MVT VT = N->getSimpleValueType(0); | |||
| 49572 | if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector()) | |||
| 49573 | return SDValue(); | |||
| 49574 | ||||
| 49575 | SDValue X, Y; | |||
| 49576 | SDValue N0 = N->getOperand(0); | |||
| 49577 | SDValue N1 = N->getOperand(1); | |||
| 49578 | ||||
| 49579 | if (SDValue Not = IsNOT(N0, DAG)) { | |||
| 49580 | X = Not; | |||
| 49581 | Y = N1; | |||
| 49582 | } else if (SDValue Not = IsNOT(N1, DAG)) { | |||
| 49583 | X = Not; | |||
| 49584 | Y = N0; | |||
| 49585 | } else | |||
| 49586 | return SDValue(); | |||
| 49587 | ||||
| 49588 | X = DAG.getBitcast(VT, X); | |||
| 49589 | Y = DAG.getBitcast(VT, Y); | |||
| 49590 | return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y); | |||
| 49591 | } | |||
| 49592 | ||||
| 49593 | /// Try to fold: | |||
| 49594 | /// and (vector_shuffle<Z,...,Z> | |||
| 49595 | /// (insert_vector_elt undef, (xor X, -1), Z), undef), Y | |||
| 49596 | /// -> | |||
| 49597 | /// andnp (vector_shuffle<Z,...,Z> | |||
| 49598 | /// (insert_vector_elt undef, X, Z), undef), Y | |||
| 49599 | static SDValue combineAndShuffleNot(SDNode *N, SelectionDAG &DAG, | |||
| 49600 | const X86Subtarget &Subtarget) { | |||
| 49601 | assert(N->getOpcode() == ISD::AND && "Unexpected opcode combine into ANDNP"); | |||
| 49602 | ||||
| 49603 | EVT VT = N->getValueType(0); | |||
| 49604 | // Do not split 256- and 512-bit vectors with SSE2, as doing so overwrites the | |||
| 49605 | // original value and requires extra moves. | |||
| 49606 | if (!((VT.is128BitVector() && Subtarget.hasSSE2()) || | |||
| 49607 | ((VT.is256BitVector() || VT.is512BitVector()) && Subtarget.hasAVX()))) | |||
| 49608 | return SDValue(); | |||
| 49609 | ||||
| 49610 | auto GetNot = [&DAG](SDValue V) { | |||
| 49611 | auto *SVN = dyn_cast<ShuffleVectorSDNode>(peekThroughOneUseBitcasts(V)); | |||
| 49612 | // TODO: SVN->hasOneUse() is a strong condition. It can be relaxed if all | |||
| 49613 | // end-users are ISD::AND including cases | |||
| 49614 | // (and(extract_vector_element(SVN), Y)). | |||
| 49615 | if (!SVN || !SVN->hasOneUse() || !SVN->isSplat() || | |||
| 49616 | !SVN->getOperand(1).isUndef()) { | |||
| 49617 | return SDValue(); | |||
| 49618 | } | |||
| 49619 | SDValue IVEN = SVN->getOperand(0); | |||
| 49620 | if (IVEN.getOpcode() != ISD::INSERT_VECTOR_ELT || | |||
| 49621 | !IVEN.getOperand(0).isUndef() || !IVEN.hasOneUse()) | |||
| 49622 | return SDValue(); | |||
| 49623 | if (!isa<ConstantSDNode>(IVEN.getOperand(2)) || | |||
| 49624 | IVEN.getConstantOperandAPInt(2) != SVN->getSplatIndex()) | |||
| 49625 | return SDValue(); | |||
| 49626 | SDValue Src = IVEN.getOperand(1); | |||
| 49627 | if (SDValue Not = IsNOT(Src, DAG)) { | |||
| 49628 | SDValue NotSrc = DAG.getBitcast(Src.getValueType(), Not); | |||
| 49629 | SDValue NotIVEN = | |||
| 49630 | DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(IVEN), IVEN.getValueType(), | |||
| 49631 | IVEN.getOperand(0), NotSrc, IVEN.getOperand(2)); | |||
| 49632 | return DAG.getVectorShuffle(SVN->getValueType(0), SDLoc(SVN), NotIVEN, | |||
| 49633 | SVN->getOperand(1), SVN->getMask()); | |||
| 49634 | } | |||
| 49635 | return SDValue(); | |||
| 49636 | }; | |||
| 49637 | ||||
| 49638 | SDValue X, Y; | |||
| 49639 | SDValue N0 = N->getOperand(0); | |||
| 49640 | SDValue N1 = N->getOperand(1); | |||
| 49641 | ||||
| 49642 | if (SDValue Not = GetNot(N0)) { | |||
| 49643 | X = Not; | |||
| 49644 | Y = N1; | |||
| 49645 | } else if (SDValue Not = GetNot(N1)) { | |||
| 49646 | X = Not; | |||
| 49647 | Y = N0; | |||
| 49648 | } else | |||
| 49649 | return SDValue(); | |||
| 49650 | ||||
| 49651 | X = DAG.getBitcast(VT, X); | |||
| 49652 | Y = DAG.getBitcast(VT, Y); | |||
| 49653 | SDLoc DL(N); | |||
| 49654 | // We do not split for SSE at all, but we need to split vectors for AVX1 and | |||
| 49655 | // AVX2. | |||
| 49656 | if (!Subtarget.useAVX512Regs() && VT.is512BitVector()) { | |||
| 49657 | SDValue LoX, HiX; | |||
| 49658 | std::tie(LoX, HiX) = splitVector(X, DAG, DL); | |||
| 49659 | SDValue LoY, HiY; | |||
| 49660 | std::tie(LoY, HiY) = splitVector(Y, DAG, DL); | |||
| 49661 | EVT SplitVT = LoX.getValueType(); | |||
| 49662 | SDValue LoV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {LoX, LoY}); | |||
| 49663 | SDValue HiV = DAG.getNode(X86ISD::ANDNP, DL, SplitVT, {HiX, HiY}); | |||
| 49664 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, {LoV, HiV}); | |||
| 49665 | } | |||
| 49666 | return DAG.getNode(X86ISD::ANDNP, DL, VT, {X, Y}); | |||
| 49667 | } | |||
| 49668 | ||||
| 49669 | // Try to widen AND, OR and XOR nodes to VT in order to remove casts around | |||
| 49670 | // logical operations, like in the example below. | |||
| 49671 | // or (and (truncate x, truncate y)), | |||
| 49672 | // (xor (truncate z, build_vector (constants))) | |||
| 49673 | // Given a target type \p VT, we generate | |||
| 49674 | // or (and x, y), (xor z, zext(build_vector (constants))) | |||
| 49675 | // given x, y and z are of type \p VT. We can do so if each operand is either | |||
| 49676 | // a truncate from VT, a constant vector (for the right-hand side), or can | |||
| 49677 | // itself be recursively promoted. | |||
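| | // e.g. with VT = v8i32, | |||
| | //   or (and (trunc v8i32 %x to v8i16), (trunc v8i32 %y to v8i16)), ... | |||
| | // is rebuilt as a v8i32 'or (and %x, %y), ...', leaving a single cast at | |||
| | // the boundary for the extend-combining caller below to remove. | |||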
| 49678 | static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG, | |||
| 49679 | unsigned Depth) { | |||
| 49680 | // Limit recursion to avoid excessive compile times. | |||
| 49681 | if (Depth >= SelectionDAG::MaxRecursionDepth) | |||
| 49682 | return SDValue(); | |||
| 49683 | ||||
| 49684 | if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND && | |||
| 49685 | N->getOpcode() != ISD::OR) | |||
| 49686 | return SDValue(); | |||
| 49687 | ||||
| 49688 | SDValue N0 = N->getOperand(0); | |||
| 49689 | SDValue N1 = N->getOperand(1); | |||
| 49690 | SDLoc DL(N); | |||
| 49691 | ||||
| 49692 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 49693 | if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT)) | |||
| 49694 | return SDValue(); | |||
| 49695 | ||||
| 49696 | if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1)) | |||
| 49697 | N0 = NN0; | |||
| 49698 | else { | |||
| 49699 | // The Left side has to be a trunc. | |||
| 49700 | if (N0.getOpcode() != ISD::TRUNCATE) | |||
| 49701 | return SDValue(); | |||
| 49702 | ||||
| 49703 | // The type of the truncated inputs. | |||
| 49704 | if (N0.getOperand(0).getValueType() != VT) | |||
| 49705 | return SDValue(); | |||
| 49706 | ||||
| 49707 | N0 = N0.getOperand(0); | |||
| 49708 | } | |||
| 49709 | ||||
| 49710 | if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1)) | |||
| 49711 | N1 = NN1; | |||
| 49712 | else { | |||
| 49713 | // The right side has to be a 'trunc' or a constant vector. | |||
| 49714 | bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE && | |||
| 49715 | N1.getOperand(0).getValueType() == VT; | |||
| 49716 | if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) | |||
| 49717 | return SDValue(); | |||
| 49718 | ||||
| 49719 | if (RHSTrunc) | |||
| 49720 | N1 = N1.getOperand(0); | |||
| 49721 | else | |||
| 49722 | N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1); | |||
| 49723 | } | |||
| 49724 | ||||
| 49725 | return DAG.getNode(N->getOpcode(), DL, VT, N0, N1); | |||
| 49726 | } | |||
| 49727 | ||||
| 49728 | // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized | |||
| 49729 | // register. In most cases we actually compare or select YMM-sized registers, | |||
| 49730 | // and mixing the two types creates horrible code. This method optimizes | |||
| 49731 | // some of the transition sequences. | |||
| 49732 | // Even with AVX-512 this is still useful for removing casts around logical | |||
| 49733 | // operations on vXi1 mask types. | |||
| 49734 | static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG, | |||
| 49735 | const X86Subtarget &Subtarget) { | |||
| 49736 | EVT VT = N->getValueType(0); | |||
| 49737 | assert(VT.isVector() && "Expected vector type"); | |||
| 49738 | ||||
| 49739 | SDLoc DL(N); | |||
| 49740 | assert((N->getOpcode() == ISD::ANY_EXTEND || | |||
| 49741 |         N->getOpcode() == ISD::ZERO_EXTEND || | |||
| 49742 |         N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node"); | |||
| 49743 | ||||
| 49744 | SDValue Narrow = N->getOperand(0); | |||
| 49745 | EVT NarrowVT = Narrow.getValueType(); | |||
| 49746 | ||||
| 49747 | // Generate the wide operation. | |||
| 49748 | SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0); | |||
| 49749 | if (!Op) | |||
| 49750 | return SDValue(); | |||
| 49751 | switch (N->getOpcode()) { | |||
| 49752 | default: llvm_unreachable("Unexpected opcode"); | |||
| 49753 | case ISD::ANY_EXTEND: | |||
| 49754 | return Op; | |||
| 49755 | case ISD::ZERO_EXTEND: | |||
| 49756 | return DAG.getZeroExtendInReg(Op, DL, NarrowVT); | |||
| 49757 | case ISD::SIGN_EXTEND: | |||
| 49758 | return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, | |||
| 49759 | Op, DAG.getValueType(NarrowVT)); | |||
| 49760 | } | |||
| 49761 | } | |||
| 49762 | ||||
| 49763 | static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) { | |||
| 49764 | unsigned FPOpcode; | |||
| 49765 | switch (Opcode) { | |||
| 49766 | default: llvm_unreachable("Unexpected input node for FP logic conversion"); | |||
| 49767 | case ISD::AND: FPOpcode = X86ISD::FAND; break; | |||
| 49768 | case ISD::OR: FPOpcode = X86ISD::FOR; break; | |||
| 49769 | case ISD::XOR: FPOpcode = X86ISD::FXOR; break; | |||
| 49770 | } | |||
| 49771 | return FPOpcode; | |||
| 49772 | } | |||
| 49773 | ||||
| 49774 | /// If both input operands of a logic op are being cast from floating-point | |||
| 49775 | /// types or FP compares, try to convert this into a floating-point logic node | |||
| 49776 | /// to avoid unnecessary moves from SSE to integer registers. | |||
| 49777 | static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG, | |||
| 49778 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 49779 | const X86Subtarget &Subtarget) { | |||
| 49780 | EVT VT = N->getValueType(0); | |||
| 49781 | SDValue N0 = N->getOperand(0); | |||
| 49782 | SDValue N1 = N->getOperand(1); | |||
| 49783 | SDLoc DL(N); | |||
| 49784 | ||||
| 49785 | if (!((N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST) || | |||
| 49786 | (N0.getOpcode() == ISD::SETCC && N1.getOpcode() == ISD::SETCC))) | |||
| 49787 | return SDValue(); | |||
| 49788 | ||||
| 49789 | SDValue N00 = N0.getOperand(0); | |||
| 49790 | SDValue N10 = N1.getOperand(0); | |||
| 49791 | EVT N00Type = N00.getValueType(); | |||
| 49792 | EVT N10Type = N10.getValueType(); | |||
| 49793 | ||||
| 49794 | // Ensure that both types are the same and are legal scalar fp types. | |||
| 49795 | if (N00Type != N10Type || !((Subtarget.hasSSE1() && N00Type == MVT::f32) || | |||
| 49796 | (Subtarget.hasSSE2() && N00Type == MVT::f64) || | |||
| 49797 | (Subtarget.hasFP16() && N00Type == MVT::f16))) | |||
| 49798 | return SDValue(); | |||
| 49799 | ||||
| 49800 | if (N0.getOpcode() == ISD::BITCAST && !DCI.isBeforeLegalizeOps()) { | |||
| 49801 | unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode()); | |||
| 49802 | SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10); | |||
| 49803 | return DAG.getBitcast(VT, FPLogic); | |||
| 49804 | } | |||
| 49805 | ||||
| 49806 | if (VT != MVT::i1 || N0.getOpcode() != ISD::SETCC || !N0.hasOneUse() || | |||
| 49807 | !N1.hasOneUse()) | |||
| 49808 | return SDValue(); | |||
| 49809 | ||||
| 49810 | ISD::CondCode CC0 = cast<CondCodeSDNode>(N0.getOperand(2))->get(); | |||
| 49811 | ISD::CondCode CC1 = cast<CondCodeSDNode>(N1.getOperand(2))->get(); | |||
| 49812 | ||||
| 49813 | // The vector ISA for FP predicates is incomplete before AVX, so converting | |||
| 49814 | // COMIS* to CMPS* may not be a win before AVX. | |||
| 49815 | if (!Subtarget.hasAVX() && | |||
| 49816 | !(cheapX86FSETCC_SSE(CC0) && cheapX86FSETCC_SSE(CC1))) | |||
| 49817 | return SDValue(); | |||
| 49818 | ||||
| 49819 | // Convert scalar FP compares and logic to vector compares (COMIS* to CMPS*) | |||
| 49820 | // and vector logic: | |||
| 49821 | // logic (setcc N00, N01), (setcc N10, N11) --> | |||
| 49822 | // extelt (logic (setcc (s2v N00), (s2v N01)), setcc (s2v N10), (s2v N11))), 0 | |||
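| | // e.g. i1 and (setcc f32 %a, %b, oeq), (setcc f32 %c, %d, olt) becomes two | |||
| | // v4f32 compares yielding v4i1 masks, a v4i1 'and', and an extract of | |||
| | // element 0, keeping the whole computation in vector registers. | |||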
| 49823 | unsigned NumElts = 128 / N00Type.getSizeInBits(); | |||
| 49824 | EVT VecVT = EVT::getVectorVT(*DAG.getContext(), N00Type, NumElts); | |||
| 49825 | EVT BoolVecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts); | |||
| 49826 | SDValue ZeroIndex = DAG.getVectorIdxConstant(0, DL); | |||
| 49827 | SDValue N01 = N0.getOperand(1); | |||
| 49828 | SDValue N11 = N1.getOperand(1); | |||
| 49829 | SDValue Vec00 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N00); | |||
| 49830 | SDValue Vec01 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N01); | |||
| 49831 | SDValue Vec10 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N10); | |||
| 49832 | SDValue Vec11 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, N11); | |||
| 49833 | SDValue Setcc0 = DAG.getSetCC(DL, BoolVecVT, Vec00, Vec01, CC0); | |||
| 49834 | SDValue Setcc1 = DAG.getSetCC(DL, BoolVecVT, Vec10, Vec11, CC1); | |||
| 49835 | SDValue Logic = DAG.getNode(N->getOpcode(), DL, BoolVecVT, Setcc0, Setcc1); | |||
| 49836 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Logic, ZeroIndex); | |||
| 49837 | } | |||
| 49838 | ||||
| 49839 | // Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y)) | |||
| 49840 | // to reduce XMM->GPR traffic. | |||
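| | // e.g. (or (movmsk %a), (movmsk %b)) -> (movmsk (or %a, %b)): one MOVMSK | |||
| | // and a vector OR instead of two MOVMSKs and a scalar OR. | |||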
| 49841 | static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) { | |||
| 49842 | unsigned Opc = N->getOpcode(); | |||
| 49843 | assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) && | |||
| 49844 |        "Unexpected bit opcode"); | |||
| 49845 | ||||
| 49846 | SDValue N0 = N->getOperand(0); | |||
| 49847 | SDValue N1 = N->getOperand(1); | |||
| 49848 | ||||
| 49849 | // Both operands must be single use MOVMSK. | |||
| 49850 | if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() || | |||
| 49851 | N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse()) | |||
| 49852 | return SDValue(); | |||
| 49853 | ||||
| 49854 | SDValue Vec0 = N0.getOperand(0); | |||
| 49855 | SDValue Vec1 = N1.getOperand(0); | |||
| 49856 | EVT VecVT0 = Vec0.getValueType(); | |||
| 49857 | EVT VecVT1 = Vec1.getValueType(); | |||
| 49858 | ||||
| 49859 | // Both MOVMSK operands must be from vectors of the same size and same element | |||
| 49860 | // size, but it's OK for one to be fp and the other to be int. | |||
| 49861 | if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() || | |||
| 49862 | VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits()) | |||
| 49863 | return SDValue(); | |||
| 49864 | ||||
| 49865 | SDLoc DL(N); | |||
| 49866 | unsigned VecOpc = | |||
| 49867 | VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc; | |||
| 49868 | SDValue Result = | |||
| 49869 | DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1)); | |||
| 49870 | return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result); | |||
| 49871 | } | |||
| 49872 | ||||
| 49873 | // Attempt to fold BITOP(SHIFT(X,Z),SHIFT(Y,Z)) -> SHIFT(BITOP(X,Y),Z). | |||
| 49874 | // NOTE: This is a very limited case of what SimplifyUsingDistributiveLaws | |||
| 49875 | // handles in InstCombine. | |||
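| | // e.g. (xor (vsrli %a, 4), (vsrli %b, 4)) -> (vsrli (xor %a, %b), 4). | |||
| | // Shifts by identical amounts distribute over bitwise logic, saving one | |||
| | // shift. | |||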
| 49876 | static SDValue combineBitOpWithShift(SDNode *N, SelectionDAG &DAG) { | |||
| 49877 | unsigned Opc = N->getOpcode(); | |||
| 49878 | assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) && | |||
| 49879 |        "Unexpected bit opcode"); | |||
| 49880 | ||||
| 49881 | SDValue N0 = N->getOperand(0); | |||
| 49882 | SDValue N1 = N->getOperand(1); | |||
| 49883 | EVT VT = N->getValueType(0); | |||
| 49884 | ||||
| 49885 | // Both operands must be single use. | |||
| 49886 | if (!N0.hasOneUse() || !N1.hasOneUse()) | |||
| 49887 | return SDValue(); | |||
| 49888 | ||||
| 49889 | // Search for matching shifts. | |||
| 49890 | SDValue BC0 = peekThroughOneUseBitcasts(N0); | |||
| 49891 | SDValue BC1 = peekThroughOneUseBitcasts(N1); | |||
| 49892 | ||||
| 49893 | unsigned BCOpc = BC0.getOpcode(); | |||
| 49894 | EVT BCVT = BC0.getValueType(); | |||
| 49895 | if (BCOpc != BC1->getOpcode() || BCVT != BC1.getValueType()) | |||
| 49896 | return SDValue(); | |||
| 49897 | ||||
| 49898 | switch (BCOpc) { | |||
| 49899 | case X86ISD::VSHLI: | |||
| 49900 | case X86ISD::VSRLI: | |||
| 49901 | case X86ISD::VSRAI: { | |||
| 49902 | if (BC0.getOperand(1) != BC1.getOperand(1)) | |||
| 49903 | return SDValue(); | |||
| 49904 | ||||
| 49905 | SDLoc DL(N); | |||
| 49906 | SDValue BitOp = | |||
| 49907 | DAG.getNode(Opc, DL, BCVT, BC0.getOperand(0), BC1.getOperand(0)); | |||
| 49908 | SDValue Shift = DAG.getNode(BCOpc, DL, BCVT, BitOp, BC0.getOperand(1)); | |||
| 49909 | return DAG.getBitcast(VT, Shift); | |||
| 49910 | } | |||
| 49911 | } | |||
| 49912 | ||||
| 49913 | return SDValue(); | |||
| 49914 | } | |||
| 49915 | ||||
| 49916 | /// If this is a zero/all-bits result that is bitwise-anded with a low-bits | |||
| 49917 | /// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and' | |||
| 49918 | /// with a shift-right to eliminate loading the vector constant mask value. | |||
| 49919 | static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, | |||
| 49920 | const X86Subtarget &Subtarget) { | |||
| 49921 | SDValue Op0 = peekThroughBitcasts(N->getOperand(0)); | |||
| 49922 | SDValue Op1 = peekThroughBitcasts(N->getOperand(1)); | |||
| 49923 | EVT VT = Op0.getValueType(); | |||
| 49924 | if (VT != Op1.getValueType() || !VT.isSimple() || !VT.isInteger()) | |||
| 49925 | return SDValue(); | |||
| 49926 | ||||
| 49927 | // Try to convert an "is positive" signbit masking operation into arithmetic | |||
| 49928 | // shift and "andn". This saves a materialization of a -1 vector constant. | |||
| 49929 | // The "is negative" variant should be handled more generally because it only | |||
| 49930 | // requires "and" rather than "andn": | |||
| 49931 | // and (pcmpgt X, -1), Y --> pandn (vsrai X, BitWidth - 1), Y | |||
| 49932 | // | |||
| 49933 | // This is limited to the original type to avoid producing even more bitcasts. | |||
| 49934 | // If the bitcasts can't be eliminated, then it is unlikely that this fold | |||
| 49935 | // will be profitable. | |||
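| | // e.g. for v4i32: (and (pcmpgt X, -1), Y) keeps the lanes of Y where X is | |||
| | // non-negative; (vsrai X, 31) broadcasts each sign bit, and ANDNP inverts | |||
| | // that mask before applying it to Y. | |||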
| 49936 | if (N->getValueType(0) == VT && | |||
| 49937 | supportedVectorShiftWithImm(VT, Subtarget, ISD::SRA)) { | |||
| 49938 | SDValue X, Y; | |||
| 49939 | if (Op1.getOpcode() == X86ISD::PCMPGT && | |||
| 49940 | isAllOnesOrAllOnesSplat(Op1.getOperand(1)) && Op1.hasOneUse()) { | |||
| 49941 | X = Op1.getOperand(0); | |||
| 49942 | Y = Op0; | |||
| 49943 | } else if (Op0.getOpcode() == X86ISD::PCMPGT && | |||
| 49944 | isAllOnesOrAllOnesSplat(Op0.getOperand(1)) && Op0.hasOneUse()) { | |||
| 49945 | X = Op0.getOperand(0); | |||
| 49946 | Y = Op1; | |||
| 49947 | } | |||
| 49948 | if (X && Y) { | |||
| 49949 | SDLoc DL(N); | |||
| 49950 | SDValue Sra = | |||
| 49951 | getTargetVShiftByConstNode(X86ISD::VSRAI, DL, VT.getSimpleVT(), X, | |||
| 49952 | VT.getScalarSizeInBits() - 1, DAG); | |||
| 49953 | return DAG.getNode(X86ISD::ANDNP, DL, VT, Sra, Y); | |||
| 49954 | } | |||
| 49955 | } | |||
| 49956 | ||||
| 49957 | APInt SplatVal; | |||
| 49958 | if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || | |||
| 49959 | !SplatVal.isMask()) | |||
| 49960 | return SDValue(); | |||
| 49961 | ||||
| 49962 | // Don't prevent creation of ANDN. | |||
| 49963 | if (isBitwiseNot(Op0)) | |||
| 49964 | return SDValue(); | |||
| 49965 | ||||
| 49966 | if (!supportedVectorShiftWithImm(VT, Subtarget, ISD::SRL)) | |||
| 49967 | return SDValue(); | |||
| 49968 | ||||
| 49969 | unsigned EltBitWidth = VT.getScalarSizeInBits(); | |||
| 49970 | if (EltBitWidth != DAG.ComputeNumSignBits(Op0)) | |||
| 49971 | return SDValue(); | |||
| 49972 | ||||
| 49973 | SDLoc DL(N); | |||
| 49974 | unsigned ShiftVal = SplatVal.countr_one(); | |||
| 49975 | SDValue ShAmt = DAG.getTargetConstant(EltBitWidth - ShiftVal, DL, MVT::i8); | |||
| 49976 | SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT, Op0, ShAmt); | |||
| 49977 | return DAG.getBitcast(N->getValueType(0), Shift); | |||
| 49978 | } | |||
| 49979 | ||||
| 49980 | // Get the index node from the lowered DAG of a GEP IR instruction with one | |||
| 49981 | // indexing dimension. | |||
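| | // e.g. a one-dimensional GEP lowered as (load (add (shl %idx, 2), %base)) | |||
| | // yields %idx as the index node. | |||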
| 49982 | static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) { | |||
| 49983 | if (Ld->isIndexed()) | |||
| 49984 | return SDValue(); | |||
| 49985 | ||||
| 49986 | SDValue Base = Ld->getBasePtr(); | |||
| 49987 | ||||
| 49988 | if (Base.getOpcode() != ISD::ADD) | |||
| 49989 | return SDValue(); | |||
| 49990 | ||||
| 49991 | SDValue ShiftedIndex = Base.getOperand(0); | |||
| 49992 | ||||
| 49993 | if (ShiftedIndex.getOpcode() != ISD::SHL) | |||
| 49994 | return SDValue(); | |||
| 49995 | ||||
| 49996 | return ShiftedIndex.getOperand(0); | |||
| 49997 | ||||
| 49998 | } | |||
| 49999 | ||||
| 50000 | static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) { | |||
| 50001 | if (Subtarget.hasBMI2() && VT.isScalarInteger()) { | |||
| 50002 | switch (VT.getSizeInBits()) { | |||
| 50003 | default: return false; | |||
| 50004 | case 64: return Subtarget.is64Bit(); | |||
| 50005 | case 32: return true; | |||
| 50006 | } | |||
| 50007 | } | |||
| 50008 | return false; | |||
| 50009 | } | |||
| 50010 | ||||
| 50011 | // This function recognizes cases where the X86 BZHI instruction can replace | |||
| 50012 | // an 'and-load' sequence. | |||
| 50013 | // When an integer value is loaded from an array of constants defined as | |||
| 50014 | // follows: | |||
| 50015 | // | |||
| 50016 | // int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1} | |||
| 50017 | // | |||
| 50018 | // and the result is ANDed with another input, the combination is equivalent | |||
| 50019 | // to performing BZHI (zero high bits) on that input with the same index as | |||
| 50020 | // the load. | |||
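| | // e.g. 'x & array[i]' with array[i] == (1 << i) - 1 is 'bzhi x, i', which | |||
| | // zeroes all bits of x from position i upward. | |||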
| 50021 | static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, | |||
| 50022 | const X86Subtarget &Subtarget) { | |||
| 50023 | MVT VT = Node->getSimpleValueType(0); | |||
| 50024 | SDLoc dl(Node); | |||
| 50025 | ||||
| 50026 | // Check if subtarget has BZHI instruction for the node's type | |||
| 50027 | if (!hasBZHI(Subtarget, VT)) | |||
| 50028 | return SDValue(); | |||
| 50029 | ||||
| 50030 | // Try matching the pattern for both operands. | |||
| 50031 | for (unsigned i = 0; i < 2; i++) { | |||
| 50032 | SDValue N = Node->getOperand(i); | |||
| 50033 | LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode()); | |||
| 50034 | ||||
| 50035 | // Bail out if the operand is not a load instruction. | |||
| 50036 | if (!Ld) | |||
| 50037 | return SDValue(); | |||
| 50038 | ||||
| 50039 | const Value *MemOp = Ld->getMemOperand()->getValue(); | |||
| 50040 | ||||
| 50041 | if (!MemOp) | |||
| 50042 | return SDValue(); | |||
| 50043 | ||||
| 50044 | if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) { | |||
| 50045 | if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) { | |||
| 50046 | if (GV->isConstant() && GV->hasDefinitiveInitializer()) { | |||
| 50047 | ||||
| 50048 | Constant *Init = GV->getInitializer(); | |||
| 50049 | Type *Ty = Init->getType(); | |||
| 50050 | if (!isa<ConstantDataArray>(Init) || | |||
| 50051 | !Ty->getArrayElementType()->isIntegerTy() || | |||
| 50052 | Ty->getArrayElementType()->getScalarSizeInBits() != | |||
| 50053 | VT.getSizeInBits() || | |||
| 50054 | Ty->getArrayNumElements() > | |||
| 50055 | Ty->getArrayElementType()->getScalarSizeInBits()) | |||
| 50056 | continue; | |||
| 50057 | ||||
| 50058 | // Check if the array's constant elements are suitable to our case. | |||
| 50059 | uint64_t ArrayElementCount = Init->getType()->getArrayNumElements(); | |||
| 50060 | bool ConstantsMatch = true; | |||
| 50061 | for (uint64_t j = 0; j < ArrayElementCount; j++) { | |||
| 50062 | auto *Elem = cast<ConstantInt>(Init->getAggregateElement(j)); | |||
| 50063 | if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) { | |||
| 50064 | ConstantsMatch = false; | |||
| 50065 | break; | |||
| 50066 | } | |||
| 50067 | } | |||
| 50068 | if (!ConstantsMatch) | |||
| 50069 | continue; | |||
| 50070 | ||||
| 50071 | // Do the transformation (For 32-bit type): | |||
| 50072 | // -> (and (load arr[idx]), inp) | |||
| 50073 | // <- (and (srl 0xFFFFFFFF, (sub 32, idx))) | |||
| 50074 | // that will be replaced with one bzhi instruction. | |||
| 50075 | SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0); | |||
| 50076 | SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, MVT::i32); | |||
| 50077 | ||||
| 50078 | // Get the Node which indexes into the array. | |||
| 50079 | SDValue Index = getIndexFromUnindexedLoad(Ld); | |||
| 50080 | if (!Index) | |||
| 50081 | return SDValue(); | |||
| 50082 | Index = DAG.getZExtOrTrunc(Index, dl, MVT::i32); | |||
| 50083 | ||||
| 50084 | SDValue Sub = DAG.getNode(ISD::SUB, dl, MVT::i32, SizeC, Index); | |||
| 50085 | Sub = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Sub); | |||
| 50086 | ||||
| 50087 | SDValue AllOnes = DAG.getAllOnesConstant(dl, VT); | |||
| 50088 | SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub); | |||
| 50089 | ||||
| 50090 | return DAG.getNode(ISD::AND, dl, VT, Inp, LShr); | |||
| 50091 | } | |||
| 50092 | } | |||
| 50093 | } | |||
| 50094 | } | |||
| 50095 | return SDValue(); | |||
| 50096 | } | |||
| 50097 | ||||
| 50098 | // Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef,))), C) | |||
| 50099 | // Where C is a mask containing the same number of bits as the setcc and | |||
| 50100 | // where the setcc will freely 0 upper bits of k-register. We can replace the | |||
| 50101 | // undef in the concat with 0s and remove the AND. This mainly helps with | |||
| 50102 | // v2i1/v4i1 setcc being cast to scalar. | |||
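| | // e.g. (and (i8 bitcast (v8i1 concat (v2i1 setcc), undef, undef, undef)), 3): | |||
| | // the mask 3 keeps only the setcc bits, so rebuilding the concat with zero | |||
| | // subvectors makes the AND redundant. | |||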
| 50103 | static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG, | |||
| 50104 | const X86Subtarget &Subtarget) { | |||
| 50105 | assert(N->getOpcode() == ISD::AND && "Unexpected opcode!"); | |||
| 50106 | ||||
| 50107 | EVT VT = N->getValueType(0); | |||
| 50108 | ||||
| 50109 | // Make sure this is an AND with a constant. We will check the value of the | |||
| 50110 | // constant later. | |||
| 50111 | auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(1)); | |||
| 50112 | if (!C1) | |||
| 50113 | return SDValue(); | |||
| 50114 | ||||
| 50115 | // This is implied by the ConstantSDNode. | |||
| 50116 | assert(!VT.isVector() && "Expected scalar VT!"); | |||
| 50117 | ||||
| 50118 | SDValue Src = N->getOperand(0); | |||
| 50119 | if (!Src.hasOneUse()) | |||
| 50120 | return SDValue(); | |||
| 50121 | ||||
| 50122 | // (Optionally) peek through any_extend(). | |||
| 50123 | if (Src.getOpcode() == ISD::ANY_EXTEND) { | |||
| 50124 | if (!Src.getOperand(0).hasOneUse()) | |||
| 50125 | return SDValue(); | |||
| 50126 | Src = Src.getOperand(0); | |||
| 50127 | } | |||
| 50128 | ||||
| 50129 | if (Src.getOpcode() != ISD::BITCAST || !Src.getOperand(0).hasOneUse()) | |||
| 50130 | return SDValue(); | |||
| 50131 | ||||
| 50132 | Src = Src.getOperand(0); | |||
| 50133 | EVT SrcVT = Src.getValueType(); | |||
| 50134 | ||||
| 50135 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 50136 | if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 || | |||
| 50137 | !TLI.isTypeLegal(SrcVT)) | |||
| 50138 | return SDValue(); | |||
| 50139 | ||||
| 50140 | if (Src.getOpcode() != ISD::CONCAT_VECTORS) | |||
| 50141 | return SDValue(); | |||
| 50142 | ||||
| 50143 | // We only care about the first subvector of the concat; we expect the | |||
| 50144 | // other subvectors to be ignored by the AND once we make the change. | |||
| 50145 | SDValue SubVec = Src.getOperand(0); | |||
| 50146 | EVT SubVecVT = SubVec.getValueType(); | |||
| 50147 | ||||
| 50148 | // The RHS of the AND should be a mask with as many bits as SubVec. | |||
| 50149 | if (!TLI.isTypeLegal(SubVecVT) || | |||
| 50150 | !C1->getAPIntValue().isMask(SubVecVT.getVectorNumElements())) | |||
| 50151 | return SDValue(); | |||
| 50152 | ||||
| 50153 | // First subvector should be a setcc with a legal result type or an | |||
| 50154 | // AND containing at least one setcc with a legal result type. | |||
| 50155 | auto IsLegalSetCC = [&](SDValue V) { | |||
| 50156 | if (V.getOpcode() != ISD::SETCC) | |||
| 50157 | return false; | |||
| 50158 | EVT SetccVT = V.getOperand(0).getValueType(); | |||
| 50159 | if (!TLI.isTypeLegal(SetccVT) || | |||
| 50160 | !(Subtarget.hasVLX() || SetccVT.is512BitVector())) | |||
| 50161 | return false; | |||
| 50162 | if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32)) | |||
| 50163 | return false; | |||
| 50164 | return true; | |||
| 50165 | }; | |||
| 50166 | if (!(IsLegalSetCC(SubVec) || (SubVec.getOpcode() == ISD::AND && | |||
| 50167 | (IsLegalSetCC(SubVec.getOperand(0)) || | |||
| 50168 | IsLegalSetCC(SubVec.getOperand(1)))))) | |||
| 50169 | return SDValue(); | |||
| 50170 | ||||
| 50171 | // We passed all the checks. Rebuild the concat_vectors with zeroes | |||
| 50172 | // and cast it back to VT. | |||
| 50173 | SDLoc dl(N); | |||
| 50174 | SmallVector<SDValue, 4> Ops(Src.getNumOperands(), | |||
| 50175 | DAG.getConstant(0, dl, SubVecVT)); | |||
| 50176 | Ops[0] = SubVec; | |||
| 50177 | SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, | |||
| 50178 | Ops); | |||
| 50179 | EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcVT.getSizeInBits()); | |||
| 50180 | return DAG.getZExtOrTrunc(DAG.getBitcast(IntVT, Concat), dl, VT); | |||
| 50181 | } | |||
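| | // Illustrative example (hypothetical types): (and (i8 (bitcast (v8i1 | |||
| | // (concat_vectors (v4i1 setcc), undef)))), 0x0F) keeps only the four setcc | |||
| | // bits, so replacing the undef half with a v4i1 zero vector makes the concat | |||
| | // bitcast to 0b0000xxxx directly and the AND with 0x0F can be dropped. | |||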
| 50182 | ||||
| 50183 | static SDValue getBMIMatchingOp(unsigned Opc, SelectionDAG &DAG, | |||
| 50184 | SDValue OpMustEq, SDValue Op, unsigned Depth) { | |||
| 50185 | // We don't want to go crazy with the recursion here. This isn't a super | |||
| 50186 | // important optimization. | |||
| 50187 | static constexpr unsigned kMaxDepth = 2; | |||
| 50188 | ||||
| 50189 | // Only do this re-ordering if op has one use. | |||
| 50190 | if (!Op.hasOneUse()) | |||
| 50191 | return SDValue(); | |||
| 50192 | ||||
| 50193 | SDLoc DL(Op); | |||
| 50194 | // If we hit another associative op, recurse further. | |||
| 50195 | if (Op.getOpcode() == Opc) { | |||
| 50196 | // Done recursing. | |||
| 50197 | if (Depth++ >= kMaxDepth) | |||
| 50198 | return SDValue(); | |||
| 50199 | ||||
| 50200 | for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) | |||
| 50201 | if (SDValue R = | |||
| 50202 | getBMIMatchingOp(Opc, DAG, OpMustEq, Op.getOperand(OpIdx), Depth)) | |||
| 50203 | return DAG.getNode(Op.getOpcode(), DL, Op.getValueType(), R, | |||
| 50204 | Op.getOperand(1 - OpIdx)); | |||
| 50205 | ||||
| 50206 | } else if (Op.getOpcode() == ISD::SUB) { | |||
| 50207 | if (Opc == ISD::AND) { | |||
| 50208 | // BLSI: (and x, (sub 0, x)) | |||
| 50209 | if (isNullConstant(Op.getOperand(0)) && Op.getOperand(1) == OpMustEq) | |||
| 50210 | return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op); | |||
| 50211 | } | |||
| 50212 | // Opc must be ISD::AND or ISD::XOR | |||
| 50213 | // BLSR: (and x, (sub x, 1)) | |||
| 50214 | // BLSMSK: (xor x, (sub x, 1)) | |||
| 50215 | if (isOneConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq) | |||
| 50216 | return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op); | |||
| 50217 | ||||
| 50218 | } else if (Op.getOpcode() == ISD::ADD) { | |||
| 50219 | // Opc must be ISD::AND or ISD::XOR | |||
| 50220 | // BLSR: (and x, (add x, -1)) | |||
| 50221 | // BLSMSK: (xor x, (add x, -1)) | |||
| 50222 | if (isAllOnesConstant(Op.getOperand(1)) && Op.getOperand(0) == OpMustEq) | |||
| 50223 | return DAG.getNode(Opc, DL, Op.getValueType(), OpMustEq, Op); | |||
| 50224 | } | |||
| 50225 | return SDValue(); | |||
| 50226 | } | |||
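| | // Worked example with illustrative bits, x = 0b01101000: | |||
| | //   BLSI:   x & (0 - x) = 0b01101000 & 0b10011000 = 0b00001000 (isolate lowest set bit) | |||
| | //   BLSR:   x & (x - 1) = 0b01101000 & 0b01100111 = 0b01100000 (clear lowest set bit) | |||
| | //   BLSMSK: x ^ (x - 1) = 0b01101000 ^ 0b01100111 = 0b00001111 (mask up to lowest set bit) | |||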
| 50227 | ||||
| 50228 | static SDValue combineBMILogicOp(SDNode *N, SelectionDAG &DAG, | |||
| 50229 | const X86Subtarget &Subtarget) { | |||
| 50230 | EVT VT = N->getValueType(0); | |||
| 50231 | // Make sure this node is a candidate for BMI instructions. | |||
| 50232 | if (!Subtarget.hasBMI() || !VT.isScalarInteger() || | |||
| 50233 | (VT != MVT::i32 && VT != MVT::i64)) | |||
| 50234 | return SDValue(); | |||
| 50235 | ||||
| 50236 | assert(N->getOpcode() == ISD::AND || N->getOpcode() == ISD::XOR); | |||
| 50237 | ||||
| 50238 | // Try and match LHS and RHS. | |||
| 50239 | for (unsigned OpIdx = 0; OpIdx < 2; ++OpIdx) | |||
| 50240 | if (SDValue OpMatch = | |||
| 50241 | getBMIMatchingOp(N->getOpcode(), DAG, N->getOperand(OpIdx), | |||
| 50242 | N->getOperand(1 - OpIdx), 0)) | |||
| 50243 | return OpMatch; | |||
| 50244 | return SDValue(); | |||
| 50245 | } | |||
| 50246 | ||||
| 50247 | static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, | |||
| 50248 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 50249 | const X86Subtarget &Subtarget) { | |||
| 50250 | SDValue N0 = N->getOperand(0); | |||
| 50251 | SDValue N1 = N->getOperand(1); | |||
| 50252 | EVT VT = N->getValueType(0); | |||
| 50253 | SDLoc dl(N); | |||
| 50254 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 50255 | ||||
| 50256 | // If this is SSE1-only, convert to FAND to avoid scalarization. | |||
| 50257 | if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { | |||
| 50258 | return DAG.getBitcast(MVT::v4i32, | |||
| 50259 | DAG.getNode(X86ISD::FAND, dl, MVT::v4f32, | |||
| 50260 | DAG.getBitcast(MVT::v4f32, N0), | |||
| 50261 | DAG.getBitcast(MVT::v4f32, N1))); | |||
| 50262 | } | |||
| 50263 | ||||
| 50264 | // Use a 32-bit and+zext if upper bits known zero. | |||
| 50265 | if (VT == MVT::i64 && Subtarget.is64Bit() && !isa<ConstantSDNode>(N1)) { | |||
| 50266 | APInt HiMask = APInt::getHighBitsSet(64, 32); | |||
| 50267 | if (DAG.MaskedValueIsZero(N1, HiMask) || | |||
| 50268 | DAG.MaskedValueIsZero(N0, HiMask)) { | |||
| 50269 | SDValue LHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N0); | |||
| 50270 | SDValue RHS = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, N1); | |||
| 50271 | return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, | |||
| 50272 | DAG.getNode(ISD::AND, dl, MVT::i32, LHS, RHS)); | |||
| 50273 | } | |||
| 50274 | } | |||
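| | // Illustrative sketch: for (and i64 %x, %y) where either operand is known to | |||
| | // have its upper 32 bits zero, the result's upper bits are zero too, so this | |||
| | // emits (zext (and i32 (trunc %x), (trunc %y))) and a 32-bit AND suffices. | |||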
| 50275 | ||||
| 50276 | // Match all-of bool scalar reductions into a bitcast/movmsk + cmp. | |||
| 50277 | // TODO: Support multiple SrcOps. | |||
| 50278 | if (VT == MVT::i1) { | |||
| 50279 | SmallVector<SDValue, 2> SrcOps; | |||
| 50280 | SmallVector<APInt, 2> SrcPartials; | |||
| 50281 | if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) && | |||
| 50282 | SrcOps.size() == 1) { | |||
| 50283 | unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); | |||
| 50284 | EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); | |||
| 50285 | SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); | |||
| 50286 | if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType())) | |||
| 50287 | Mask = DAG.getBitcast(MaskVT, SrcOps[0]); | |||
| 50288 | if (Mask) { | |||
| 50289 | assert(SrcPartials[0].getBitWidth() == NumElts && | |||
| 50290 | "Unexpected partial reduction mask"); | |||
| 50291 | SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT); | |||
| 50292 | Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits); | |||
| 50293 | return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ); | |||
| 50294 | } | |||
| 50295 | } | |||
| 50296 | } | |||
| 50297 | ||||
| 50298 | if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget)) | |||
| 50299 | return V; | |||
| 50300 | ||||
| 50301 | if (SDValue R = combineBitOpWithMOVMSK(N, DAG)) | |||
| 50302 | return R; | |||
| 50303 | ||||
| 50304 | if (SDValue R = combineBitOpWithShift(N, DAG)) | |||
| 50305 | return R; | |||
| 50306 | ||||
| 50307 | if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget)) | |||
| 50308 | return FPLogic; | |||
| 50309 | ||||
| 50310 | if (SDValue R = combineAndShuffleNot(N, DAG, Subtarget)) | |||
| 50311 | return R; | |||
| 50312 | ||||
| 50313 | if (DCI.isBeforeLegalizeOps()) | |||
| 50314 | return SDValue(); | |||
| 50315 | ||||
| 50316 | if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) | |||
| 50317 | return R; | |||
| 50318 | ||||
| 50319 | if (SDValue R = combineAndNotIntoANDNP(N, DAG)) | |||
| 50320 | return R; | |||
| 50321 | ||||
| 50322 | if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget)) | |||
| 50323 | return ShiftRight; | |||
| 50324 | ||||
| 50325 | if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget)) | |||
| 50326 | return R; | |||
| 50327 | ||||
| 50328 | // fold (and (mul x, c1), c2) -> (mul x, (and c1, c2)) | |||
| 50329 | // iff c2 is an all-bits/no-bits mask, i.e. a select-with-zero mask. | |||
| 50330 | // TODO: Handle PMULDQ/PMULUDQ/VPMADDWD/VPMADDUBSW? | |||
| 50331 | if (VT.isVector() && getTargetConstantFromNode(N1)) { | |||
| 50332 | unsigned Opc0 = N0.getOpcode(); | |||
| 50333 | if ((Opc0 == ISD::MUL || Opc0 == ISD::MULHU || Opc0 == ISD::MULHS) && | |||
| 50334 | getTargetConstantFromNode(N0.getOperand(1)) && | |||
| 50335 | DAG.ComputeNumSignBits(N1) == VT.getScalarSizeInBits() && | |||
| 50336 | N0->hasOneUse() && N0.getOperand(1)->hasOneUse()) { | |||
| 50337 | SDValue MaskMul = DAG.getNode(ISD::AND, dl, VT, N0.getOperand(1), N1); | |||
| 50338 | return DAG.getNode(Opc0, dl, VT, N0.getOperand(0), MaskMul); | |||
| 50339 | } | |||
| 50340 | } | |||
| 50341 | ||||
| 50342 | // Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y), COND_B) iff Y is not a constant; | |||
| 50343 | // this avoids a slow variable shift (moving the shift amount to ECX etc.). | |||
| 50344 | if (isOneConstant(N1) && N0->hasOneUse()) { | |||
| 50345 | SDValue Src = N0; | |||
| 50346 | while ((Src.getOpcode() == ISD::ZERO_EXTEND || | |||
| 50347 | Src.getOpcode() == ISD::TRUNCATE) && | |||
| 50348 | Src.getOperand(0)->hasOneUse()) | |||
| 50349 | Src = Src.getOperand(0); | |||
| 50350 | bool ContainsNOT = false; | |||
| 50351 | X86::CondCode X86CC = X86::COND_B; | |||
| 50352 | // Peek through AND(NOT(SRL(X,Y)),1). | |||
| 50353 | if (isBitwiseNot(Src)) { | |||
| 50354 | Src = Src.getOperand(0); | |||
| 50355 | X86CC = X86::COND_AE; | |||
| 50356 | ContainsNOT = true; | |||
| 50357 | } | |||
| 50358 | if (Src.getOpcode() == ISD::SRL && | |||
| 50359 | !isa<ConstantSDNode>(Src.getOperand(1))) { | |||
| 50360 | SDValue BitNo = Src.getOperand(1); | |||
| 50361 | Src = Src.getOperand(0); | |||
| 50362 | // Peek through AND(SRL(NOT(X),Y),1). | |||
| 50363 | if (isBitwiseNot(Src)) { | |||
| 50364 | Src = Src.getOperand(0); | |||
| 50365 | X86CC = X86CC == X86::COND_AE ? X86::COND_B : X86::COND_AE; | |||
| 50366 | ContainsNOT = true; | |||
| 50367 | } | |||
| 50368 | // If we have BMI2 then SHRX should be faster for i32/i64 cases. | |||
| 50369 | if (!(Subtarget.hasBMI2() && !ContainsNOT && VT.getSizeInBits() >= 32)) | |||
| 50370 | if (SDValue BT = getBT(Src, BitNo, dl, DAG)) | |||
| 50371 | return DAG.getZExtOrTrunc(getSETCC(X86CC, BT, dl, DAG), dl, VT); | |||
| 50372 | } | |||
| 50373 | } | |||
| 50374 | ||||
| 50375 | if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { | |||
| 50376 | // Attempt to recursively combine a bitmask AND with shuffles. | |||
| 50377 | SDValue Op(N, 0); | |||
| 50378 | if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) | |||
| 50379 | return Res; | |||
| 50380 | ||||
| 50381 | // If either operand is a constant mask, then only the elements that aren't | |||
| 50382 | // zero are actually demanded by the other operand. | |||
| 50383 | auto GetDemandedMasks = [&](SDValue Op) { | |||
| 50384 | APInt UndefElts; | |||
| 50385 | SmallVector<APInt> EltBits; | |||
| 50386 | int NumElts = VT.getVectorNumElements(); | |||
| 50387 | int EltSizeInBits = VT.getScalarSizeInBits(); | |||
| 50388 | APInt DemandedBits = APInt::getAllOnes(EltSizeInBits); | |||
| 50389 | APInt DemandedElts = APInt::getAllOnes(NumElts); | |||
| 50390 | if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, | |||
| 50391 | EltBits)) { | |||
| 50392 | DemandedBits.clearAllBits(); | |||
| 50393 | DemandedElts.clearAllBits(); | |||
| 50394 | for (int I = 0; I != NumElts; ++I) { | |||
| 50395 | if (UndefElts[I]) { | |||
| 50396 | // We can't assume an undef src element gives an undef dst - the | |||
| 50397 | // other src might be zero. | |||
| 50398 | DemandedBits.setAllBits(); | |||
| 50399 | DemandedElts.setBit(I); | |||
| 50400 | } else if (!EltBits[I].isZero()) { | |||
| 50401 | DemandedBits |= EltBits[I]; | |||
| 50402 | DemandedElts.setBit(I); | |||
| 50403 | } | |||
| 50404 | } | |||
| 50405 | } | |||
| 50406 | return std::make_pair(DemandedBits, DemandedElts); | |||
| 50407 | }; | |||
| 50408 | APInt Bits0, Elts0; | |||
| 50409 | APInt Bits1, Elts1; | |||
| 50410 | std::tie(Bits0, Elts0) = GetDemandedMasks(N1); | |||
| 50411 | std::tie(Bits1, Elts1) = GetDemandedMasks(N0); | |||
| 50412 | ||||
| 50413 | if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) || | |||
| 50414 | TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) || | |||
| 50415 | TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) || | |||
| 50416 | TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) { | |||
| 50417 | if (N->getOpcode() != ISD::DELETED_NODE) | |||
| 50418 | DCI.AddToWorklist(N); | |||
| 50419 | return SDValue(N, 0); | |||
| 50420 | } | |||
| 50421 | ||||
| 50422 | SDValue NewN0 = TLI.SimplifyMultipleUseDemandedBits(N0, Bits0, Elts0, DAG); | |||
| 50423 | SDValue NewN1 = TLI.SimplifyMultipleUseDemandedBits(N1, Bits1, Elts1, DAG); | |||
| 50424 | if (NewN0 || NewN1) | |||
| 50425 | return DAG.getNode(ISD::AND, dl, VT, NewN0 ? NewN0 : N0, | |||
| 50426 | NewN1 ? NewN1 : N1); | |||
| 50427 | } | |||
| 50428 | ||||
| 50429 | // Attempt to combine a scalar bitmask AND with an extracted shuffle. | |||
| 50430 | if ((VT.getScalarSizeInBits() % 8) == 0 && | |||
| 50431 | N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && | |||
| 50432 | isa<ConstantSDNode>(N0.getOperand(1))) { | |||
| 50433 | SDValue BitMask = N1; | |||
| 50434 | SDValue SrcVec = N0.getOperand(0); | |||
| 50435 | EVT SrcVecVT = SrcVec.getValueType(); | |||
| 50436 | ||||
| 50437 | // Check that the constant bitmask masks whole bytes. | |||
| 50438 | APInt UndefElts; | |||
| 50439 | SmallVector<APInt, 64> EltBits; | |||
| 50440 | if (VT == SrcVecVT.getScalarType() && N0->isOnlyUserOf(SrcVec.getNode()) && | |||
| 50441 | getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) && | |||
| 50442 | llvm::all_of(EltBits, [](const APInt &M) { | |||
| 50443 | return M.isZero() || M.isAllOnes(); | |||
| 50444 | })) { | |||
| 50445 | unsigned NumElts = SrcVecVT.getVectorNumElements(); | |||
| 50446 | unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8; | |||
| 50447 | unsigned Idx = N0.getConstantOperandVal(1); | |||
| 50448 | ||||
| 50449 | // Create a root shuffle mask from the byte mask and the extracted index. | |||
| 50450 | SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef); | |||
| 50451 | for (unsigned i = 0; i != Scale; ++i) { | |||
| 50452 | if (UndefElts[i]) | |||
| 50453 | continue; | |||
| 50454 | int VecIdx = Scale * Idx + i; | |||
| 50455 | ShuffleMask[VecIdx] = EltBits[i].isZero() ? SM_SentinelZero : VecIdx; | |||
| 50456 | } | |||
| 50457 | ||||
| 50458 | if (SDValue Shuffle = combineX86ShufflesRecursively( | |||
| 50459 | {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1, | |||
| 50460 | X86::MaxShuffleCombineDepth, | |||
| 50461 | /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true, | |||
| 50462 | /*AllowVarPerLaneMask*/ true, DAG, Subtarget)) | |||
| 50463 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle, | |||
| 50464 | N0.getOperand(1)); | |||
| 50465 | } | |||
| 50466 | } | |||
| 50467 | ||||
| 50468 | if (SDValue R = combineBMILogicOp(N, DAG, Subtarget)) | |||
| 50469 | return R; | |||
| 50470 | ||||
| 50471 | return SDValue(); | |||
| 50472 | } | |||
| 50473 | ||||
| 50474 | // Canonicalize OR(AND(X,C),AND(Y,~C)) -> OR(AND(X,C),ANDNP(C,Y)) | |||
| 50475 | static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG, | |||
| 50476 | const X86Subtarget &Subtarget) { | |||
| 50477 | assert(N->getOpcode() == ISD::OR && "Unexpected Opcode"); | |||
| 50478 | ||||
| 50479 | MVT VT = N->getSimpleValueType(0); | |||
| 50480 | unsigned EltSizeInBits = VT.getScalarSizeInBits(); | |||
| 50481 | if (!VT.isVector() || (EltSizeInBits % 8) != 0) | |||
| 50482 | return SDValue(); | |||
| 50483 | ||||
| 50484 | SDValue N0 = peekThroughBitcasts(N->getOperand(0)); | |||
| 50485 | SDValue N1 = peekThroughBitcasts(N->getOperand(1)); | |||
| 50486 | if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND) | |||
| 50487 | return SDValue(); | |||
| 50488 | ||||
| 50489 | // On XOP we'll lower to PCMOV so accept one use. With AVX512, we can use | |||
| 50490 | // VPTERNLOG. Otherwise only do this if either mask has multiple uses already. | |||
| 50491 | if (!(Subtarget.hasXOP() || useVPTERNLOG(Subtarget, VT) || | |||
| 50492 | !N0.getOperand(1).hasOneUse() || !N1.getOperand(1).hasOneUse())) | |||
| 50493 | return SDValue(); | |||
| 50494 | ||||
| 50495 | // Attempt to extract constant byte masks. | |||
| 50496 | APInt UndefElts0, UndefElts1; | |||
| 50497 | SmallVector<APInt, 32> EltBits0, EltBits1; | |||
| 50498 | if (!getTargetConstantBitsFromNode(N0.getOperand(1), 8, UndefElts0, EltBits0, | |||
| 50499 | false, false)) | |||
| 50500 | return SDValue(); | |||
| 50501 | if (!getTargetConstantBitsFromNode(N1.getOperand(1), 8, UndefElts1, EltBits1, | |||
| 50502 | false, false)) | |||
| 50503 | return SDValue(); | |||
| 50504 | ||||
| 50505 | for (unsigned i = 0, e = EltBits0.size(); i != e; ++i) { | |||
| 50506 | // TODO - add UNDEF elts support. | |||
| 50507 | if (UndefElts0[i] || UndefElts1[i]) | |||
| 50508 | return SDValue(); | |||
| 50509 | if (EltBits0[i] != ~EltBits1[i]) | |||
| 50510 | return SDValue(); | |||
| 50511 | } | |||
| 50512 | ||||
| 50513 | SDLoc DL(N); | |||
| 50514 | ||||
| 50515 | if (useVPTERNLOG(Subtarget, VT)) { | |||
| 50516 | // Emit a VPTERNLOG node directly - 0xCA is the imm code for A?B:C. | |||
| 50517 | // VPTERNLOG is only available for vXi32/vXi64 types. | |||
| 50518 | MVT OpSVT = EltSizeInBits == 32 ? MVT::i32 : MVT::i64; | |||
| 50519 | MVT OpVT = | |||
| 50520 | MVT::getVectorVT(OpSVT, VT.getSizeInBits() / OpSVT.getSizeInBits()); | |||
| 50521 | SDValue A = DAG.getBitcast(OpVT, N0.getOperand(1)); | |||
| 50522 | SDValue B = DAG.getBitcast(OpVT, N0.getOperand(0)); | |||
| 50523 | SDValue C = DAG.getBitcast(OpVT, N1.getOperand(0)); | |||
| 50524 | SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8); | |||
| 50525 | SDValue Res = getAVX512Node(X86ISD::VPTERNLOG, DL, OpVT, {A, B, C, Imm}, | |||
| 50526 | DAG, Subtarget); | |||
| 50527 | return DAG.getBitcast(VT, Res); | |||
| 50528 | } | |||
| 50529 | ||||
| 50530 | SDValue X = N->getOperand(0); | |||
| 50531 | SDValue Y = | |||
| 50532 | DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)), | |||
| 50533 | DAG.getBitcast(VT, N1.getOperand(0))); | |||
| 50534 | return DAG.getNode(ISD::OR, DL, VT, X, Y); | |||
| 50535 | } | |||
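| | // Why 0xCA encodes A?B:C (illustrative derivation): VPTERNLOG's imm8 is a | |||
| | // truth table indexed by bit (A<<2)|(B<<1)|C. Selecting B when A=1 and C when | |||
| | // A=0 sets indices 7,6 (A=1,B=1) and 3,1 (A=0,C=1): 0b11001010 = 0xCA. | |||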
| 50536 | ||||
| 50537 | // Try to match OR(AND(~MASK,X),AND(MASK,Y)) logic pattern. | |||
| 50538 | static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) { | |||
| 50539 | if (N->getOpcode() != ISD::OR) | |||
| 50540 | return false; | |||
| 50541 | ||||
| 50542 | SDValue N0 = N->getOperand(0); | |||
| 50543 | SDValue N1 = N->getOperand(1); | |||
| 50544 | ||||
| 50545 | // Canonicalize AND to LHS. | |||
| 50546 | if (N1.getOpcode() == ISD::AND) | |||
| 50547 | std::swap(N0, N1); | |||
| 50548 | ||||
| 50549 | // Attempt to match OR(AND(M,Y),ANDNP(M,X)). | |||
| 50550 | if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP) | |||
| 50551 | return false; | |||
| 50552 | ||||
| 50553 | Mask = N1.getOperand(0); | |||
| 50554 | X = N1.getOperand(1); | |||
| 50555 | ||||
| 50556 | // Check to see if the mask appeared in both the AND and ANDNP. | |||
| 50557 | if (N0.getOperand(0) == Mask) | |||
| 50558 | Y = N0.getOperand(1); | |||
| 50559 | else if (N0.getOperand(1) == Mask) | |||
| 50560 | Y = N0.getOperand(0); | |||
| 50561 | else | |||
| 50562 | return false; | |||
| 50563 | ||||
| 50564 | // TODO: Attempt to match against AND(XOR(-1,M),Y) as well; waiting for the | |||
| 50565 | // ANDNP combine lets other combines happen that prevent matching. | |||
| 50566 | return true; | |||
| 50567 | } | |||
| 50568 | ||||
| 50569 | // Try to fold: | |||
| 50570 | // (or (and (m, y), (pandn m, x))) | |||
| 50571 | // into: | |||
| 50572 | // (vselect m, x, y) | |||
| 50573 | // As a special case, try to fold: | |||
| 50574 | // (or (and (m, (sub 0, x)), (pandn m, x))) | |||
| 50575 | // into: | |||
| 50576 | // (sub (xor X, M), M) | |||
| 50577 | static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, | |||
| 50578 | const X86Subtarget &Subtarget) { | |||
| 50579 | assert(N->getOpcode() == ISD::OR && "Unexpected Opcode"); | |||
| 50580 | ||||
| 50581 | EVT VT = N->getValueType(0); | |||
| 50582 | if (!((VT.is128BitVector() && Subtarget.hasSSE2()) || | |||
| 50583 | (VT.is256BitVector() && Subtarget.hasInt256()))) | |||
| 50584 | return SDValue(); | |||
| 50585 | ||||
| 50586 | SDValue X, Y, Mask; | |||
| 50587 | if (!matchLogicBlend(N, X, Y, Mask)) | |||
| 50588 | return SDValue(); | |||
| 50589 | ||||
| 50590 | // Validate that X, Y, and Mask are bitcasts, and see through them. | |||
| 50591 | Mask = peekThroughBitcasts(Mask); | |||
| 50592 | X = peekThroughBitcasts(X); | |||
| 50593 | Y = peekThroughBitcasts(Y); | |||
| 50594 | ||||
| 50595 | EVT MaskVT = Mask.getValueType(); | |||
| 50596 | unsigned EltBits = MaskVT.getScalarSizeInBits(); | |||
| 50597 | ||||
| 50598 | // TODO: Attempt to handle floating point cases as well? | |||
| 50599 | if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits) | |||
| 50600 | return SDValue(); | |||
| 50601 | ||||
| 50602 | SDLoc DL(N); | |||
| 50603 | ||||
| 50604 | // Attempt to combine to conditional negate: (sub (xor X, M), M) | |||
| 50605 | if (SDValue Res = combineLogicBlendIntoConditionalNegate(VT, Mask, X, Y, DL, | |||
| 50606 | DAG, Subtarget)) | |||
| 50607 | return Res; | |||
| 50608 | ||||
| 50609 | // PBLENDVB is only available on SSE 4.1. | |||
| 50610 | if (!Subtarget.hasSSE41()) | |||
| 50611 | return SDValue(); | |||
| 50612 | ||||
| 50613 | // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops. | |||
| 50614 | if (Subtarget.hasVLX()) | |||
| 50615 | return SDValue(); | |||
| 50616 | ||||
| 50617 | MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8; | |||
| 50618 | ||||
| 50619 | X = DAG.getBitcast(BlendVT, X); | |||
| 50620 | Y = DAG.getBitcast(BlendVT, Y); | |||
| 50621 | Mask = DAG.getBitcast(BlendVT, Mask); | |||
| 50622 | Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X); | |||
| 50623 | return DAG.getBitcast(VT, Mask); | |||
| 50624 | } | |||
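| | // Sanity check of the conditional-negate special case (illustrative): with a | |||
| | // per-element mask M of all-ones or zero, (sub (xor X, M), M) gives | |||
| | //   M == -1: (~X) - (-1) = ~X + 1 = -X, and M == 0: X - 0 = X, | |||
| | // matching the blend of (sub 0, X) and X selected by M. | |||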
| 50625 | ||||
| 50626 | // Helper function for combineOrCmpEqZeroToCtlzSrl | |||
| 50627 | // Transforms: | |||
| 50628 | // seteq(cmp x, 0) | |||
| 50629 | // into: | |||
| 50630 | // srl(ctlz x), log2(bitsize(x)) | |||
| 50631 | // Input pattern is checked by caller. | |||
| 50632 | static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, SelectionDAG &DAG) { | |||
| 50633 | SDValue Cmp = Op.getOperand(1); | |||
| 50634 | EVT VT = Cmp.getOperand(0).getValueType(); | |||
| 50635 | unsigned Log2b = Log2_32(VT.getSizeInBits()); | |||
| 50636 | SDLoc dl(Op); | |||
| 50637 | SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0)); | |||
| 50638 | // The result of the shift is true or false, and on X86, the 32-bit | |||
| 50639 | // encoding of shr and lzcnt is more desirable. | |||
| 50640 | SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32); | |||
| 50641 | SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc, | |||
| 50642 | DAG.getConstant(Log2b, dl, MVT::i8)); | |||
| 50643 | return Scc; | |||
| 50644 | } | |||
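| | // Worked example (illustrative): for i32, Log2b = 5 and LZCNT gives | |||
| | // ctlz(0) = 32 = 0b100000, so ctlz(x) >> 5 == 1 iff x == 0; for any x != 0, | |||
| | // ctlz(x) <= 31 and ctlz(x) >> 5 == 0. Hence seteq(x, 0) == srl(ctlz(x), 5). | |||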
| 50645 | ||||
| 50646 | // Try to transform: | |||
| 50647 | // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0)))) | |||
| 50648 | // into: | |||
| 50649 | // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))) | |||
| 50650 | // Will also attempt to match more generic cases, eg: | |||
| 50651 | // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0))) | |||
| 50652 | // Only applies if the target supports the FastLZCNT feature. | |||
| 50653 | static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, | |||
| 50654 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 50655 | const X86Subtarget &Subtarget) { | |||
| 50656 | if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast()) | |||
| 50657 | return SDValue(); | |||
| 50658 | ||||
| 50659 | auto isORCandidate = [](SDValue N) { | |||
| 50660 | return (N->getOpcode() == ISD::OR && N->hasOneUse()); | |||
| 50661 | }; | |||
| 50662 | ||||
| 50663 | // Check that the zero extend is extending to 32 bits or more. The code generated by | |||
| 50664 | // srl(ctlz) for 16-bit or less variants of the pattern would require extra | |||
| 50665 | // instructions to clear the upper bits. | |||
| 50666 | if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) || | |||
| 50667 | !isORCandidate(N->getOperand(0))) | |||
| 50668 | return SDValue(); | |||
| 50669 | ||||
| 50670 | // Check the node matches: setcc(eq, cmp 0) | |||
| 50671 | auto isSetCCCandidate = [](SDValue N) { | |||
| 50672 | return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() && | |||
| 50673 | X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E && | |||
| 50674 | N->getOperand(1).getOpcode() == X86ISD::CMP && | |||
| 50675 | isNullConstant(N->getOperand(1).getOperand(1)) && | |||
| 50676 | N->getOperand(1).getValueType().bitsGE(MVT::i32); | |||
| 50677 | }; | |||
| 50678 | ||||
| 50679 | SDNode *OR = N->getOperand(0).getNode(); | |||
| 50680 | SDValue LHS = OR->getOperand(0); | |||
| 50681 | SDValue RHS = OR->getOperand(1); | |||
| 50682 | ||||
| 50683 | // Save nodes matching or(or, setcc(eq, cmp 0)). | |||
| 50684 | SmallVector<SDNode *, 2> ORNodes; | |||
| 50685 | while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) || | |||
| 50686 | (isORCandidate(RHS) && isSetCCCandidate(LHS)))) { | |||
| 50687 | ORNodes.push_back(OR); | |||
| 50688 | OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode(); | |||
| 50689 | LHS = OR->getOperand(0); | |||
| 50690 | RHS = OR->getOperand(1); | |||
| 50691 | } | |||
| 50692 | ||||
| 50693 | // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)). | |||
| 50694 | if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) || | |||
| 50695 | !isORCandidate(SDValue(OR, 0))) | |||
| 50696 | return SDValue(); | |||
| 50697 | ||||
| 50698 | // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it | |||
| 50699 | // to | |||
| 50700 | // or(srl(ctlz),srl(ctlz)). | |||
| 50701 | // The dag combiner can then fold it into: | |||
| 50702 | // srl(or(ctlz, ctlz)). | |||
| 50703 | SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, DAG); | |||
| 50704 | SDValue Ret, NewRHS; | |||
| 50705 | if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG))) | |||
| 50706 | Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, NewLHS, NewRHS); | |||
| 50707 | ||||
| 50708 | if (!Ret) | |||
| 50709 | return SDValue(); | |||
| 50710 | ||||
| 50711 | // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern. | |||
| 50712 | while (!ORNodes.empty()) { | |||
| 50713 | OR = ORNodes.pop_back_val(); | |||
| 50714 | LHS = OR->getOperand(0); | |||
| 50715 | RHS = OR->getOperand(1); | |||
| 50716 | // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or). | |||
| 50717 | if (RHS->getOpcode() == ISD::OR) | |||
| 50718 | std::swap(LHS, RHS); | |||
| 50719 | NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, DAG); | |||
| 50720 | if (!NewRHS) | |||
| 50721 | return SDValue(); | |||
| 50722 | Ret = DAG.getNode(ISD::OR, SDLoc(OR), MVT::i32, Ret, NewRHS); | |||
| 50723 | } | |||
| 50724 | ||||
| 50725 | return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret); | |||
| 50726 | } | |||
| 50727 | ||||
| 50728 | static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R, | |||
| 50729 | SDValue And1_L, SDValue And1_R, | |||
| 50730 | const SDLoc &DL, SelectionDAG &DAG) { | |||
| 50731 | if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse()) | |||
| 50732 | return SDValue(); | |||
| 50733 | SDValue NotOp = And0_L->getOperand(0); | |||
| 50734 | if (NotOp == And1_R) | |||
| 50735 | std::swap(And1_R, And1_L); | |||
| 50736 | if (NotOp != And1_L) | |||
| 50737 | return SDValue(); | |||
| 50738 | ||||
| 50739 | // (~(NotOp) & And0_R) | (NotOp & And1_R) | |||
| 50740 | // --> ((And0_R ^ And1_R) & NotOp) ^ And0_R | |||
| 50741 | EVT VT = And1_L->getValueType(0); | |||
| 50742 | SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R); | |||
| 50743 | SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R); | |||
| 50744 | SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp); | |||
| 50745 | SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R); | |||
| 50746 | return Xor1; | |||
| 50747 | } | |||
| 50748 | ||||
| 50749 | /// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the | |||
| 50750 | /// equivalent `((x ^ y) & m) ^ y` pattern. | |||
| 50751 | /// This is typically a better representation for targets without a fused | |||
| 50752 | /// "and-not" operation. This function is intended to be called from a | |||
| 50753 | /// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes. | |||
| 50754 | static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) { | |||
| 50755 | // Note that masked-merge variants using XOR or ADD expressions are | |||
| 50756 | // normalized to OR by InstCombine so we only check for OR. | |||
| 50757 | assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node"); | |||
| 50758 | SDValue N0 = Node->getOperand(0); | |||
| 50759 | if (N0->getOpcode() != ISD::AND || !N0->hasOneUse()) | |||
| 50760 | return SDValue(); | |||
| 50761 | SDValue N1 = Node->getOperand(1); | |||
| 50762 | if (N1->getOpcode() != ISD::AND || !N1->hasOneUse()) | |||
| 50763 | return SDValue(); | |||
| 50764 | ||||
| 50765 | SDLoc DL(Node); | |||
| 50766 | SDValue N00 = N0->getOperand(0); | |||
| 50767 | SDValue N01 = N0->getOperand(1); | |||
| 50768 | SDValue N10 = N1->getOperand(0); | |||
| 50769 | SDValue N11 = N1->getOperand(1); | |||
| 50770 | if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG)) | |||
| 50771 | return Result; | |||
| 50772 | if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG)) | |||
| 50773 | return Result; | |||
| 50774 | if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG)) | |||
| 50775 | return Result; | |||
| 50776 | if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG)) | |||
| 50777 | return Result; | |||
| 50778 | return SDValue(); | |||
| 50779 | } | |||
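| | // Bit-level check of the identity (illustrative values): m = 0b1100, | |||
| | // x = 0b1010, y = 0b0110. (m & x) | (~m & y) = 0b1000 | 0b0010 = 0b1010; | |||
| | // ((x ^ y) & m) ^ y = (0b1100 & 0b1100) ^ 0b0110 = 0b1100 ^ 0b0110 = 0b1010. | |||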
| 50780 | ||||
| 50781 | /// If this is an add or subtract where one operand is produced by a cmp+setcc, | |||
| 50782 | /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} | |||
| 50783 | /// with CMP+{ADC, SBB}. | |||
| 50784 | /// Also try (ADD/SUB)+(AND(SRL,1)) bit extraction pattern with BT+{ADC, SBB}. | |||
| 50785 | static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT, | |||
| 50786 | SDValue X, SDValue Y, | |||
| 50787 | SelectionDAG &DAG, | |||
| 50788 | bool ZeroSecondOpOnly = false) { | |||
| 50789 | if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) | |||
| 50790 | return SDValue(); | |||
| 50791 | ||||
| 50792 | // Look through a one-use zext. | |||
| 50793 | if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) | |||
| 50794 | Y = Y.getOperand(0); | |||
| 50795 | ||||
| 50796 | X86::CondCode CC; | |||
| 50797 | SDValue EFLAGS; | |||
| 50798 | if (Y.getOpcode() == X86ISD::SETCC && Y.hasOneUse()) { | |||
| 50799 | CC = (X86::CondCode)Y.getConstantOperandVal(0); | |||
| 50800 | EFLAGS = Y.getOperand(1); | |||
| 50801 | } else if (Y.getOpcode() == ISD::AND && isOneConstant(Y.getOperand(1)) && | |||
| 50802 | Y.hasOneUse()) { | |||
| 50803 | EFLAGS = LowerAndToBT(Y, ISD::SETNE, DL, DAG, CC); | |||
| 50804 | } | |||
| 50805 | ||||
| 50806 | if (!EFLAGS) | |||
| 50807 | return SDValue(); | |||
| 50808 | ||||
| 50809 | // If X is -1 or 0, then we have an opportunity to avoid constants required in | |||
| 50810 | // the general case below. | |||
| 50811 | auto *ConstantX = dyn_cast<ConstantSDNode>(X); | |||
| 50812 | if (ConstantX && !ZeroSecondOpOnly) { | |||
| 50813 | if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnes()) || | |||
| 50814 | (IsSub && CC == X86::COND_B && ConstantX->isZero())) { | |||
| 50815 | // This is a complicated way to get -1 or 0 from the carry flag: | |||
| 50816 | // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax | |||
| 50817 | // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax | |||
| 50818 | return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, | |||
| 50819 | DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), | |||
| 50820 | EFLAGS); | |||
| 50821 | } | |||
| 50822 | ||||
| 50823 | if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnes()) || | |||
| 50824 | (IsSub && CC == X86::COND_A && ConstantX->isZero())) { | |||
| 50825 | if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && | |||
| 50826 | EFLAGS.getValueType().isInteger() && | |||
| 50827 | !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { | |||
| 50828 | // Swap the operands of a SUB, and we have the same pattern as above. | |||
| 50829 | // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB | |||
| 50830 | // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB | |||
| 50831 | SDValue NewSub = DAG.getNode( | |||
| 50832 | X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), | |||
| 50833 | EFLAGS.getOperand(1), EFLAGS.getOperand(0)); | |||
| 50834 | SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); | |||
| 50835 | return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, | |||
| 50836 | DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), | |||
| 50837 | NewEFLAGS); | |||
| 50838 | } | |||
| 50839 | } | |||
| 50840 | } | |||
| 50841 | ||||
| 50842 | if (CC == X86::COND_B) { | |||
| 50843 | // X + SETB Z --> adc X, 0 | |||
| 50844 | // X - SETB Z --> sbb X, 0 | |||
| 50845 | return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, | |||
| 50846 | DAG.getVTList(VT, MVT::i32), X, | |||
| 50847 | DAG.getConstant(0, DL, VT), EFLAGS); | |||
| 50848 | } | |||
| 50849 | ||||
| 50850 | if (ZeroSecondOpOnly) | |||
| 50851 | return SDValue(); | |||
| 50852 | ||||
| 50853 | if (CC == X86::COND_A) { | |||
| 50854 | // Try to convert COND_A into COND_B in an attempt to facilitate | |||
| 50855 | // materializing "setb reg". | |||
| 50856 | // | |||
| 50857 | // Do not flip "e > c", where "c" is a constant, because Cmp instruction | |||
| 50858 | // cannot take an immediate as its first operand. | |||
| 50859 | // | |||
| 50860 | if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && | |||
| 50861 | EFLAGS.getValueType().isInteger() && | |||
| 50862 | !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { | |||
| 50863 | SDValue NewSub = | |||
| 50864 | DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), | |||
| 50865 | EFLAGS.getOperand(1), EFLAGS.getOperand(0)); | |||
| 50866 | SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); | |||
| 50867 | return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, | |||
| 50868 | DAG.getVTList(VT, MVT::i32), X, | |||
| 50869 | DAG.getConstant(0, DL, VT), NewEFLAGS); | |||
| 50870 | } | |||
| 50871 | } | |||
| 50872 | ||||
| 50873 | if (CC == X86::COND_AE) { | |||
| 50874 | // X + SETAE --> sbb X, -1 | |||
| 50875 | // X - SETAE --> adc X, -1 | |||
| 50876 | return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, | |||
| 50877 | DAG.getVTList(VT, MVT::i32), X, | |||
| 50878 | DAG.getConstant(-1, DL, VT), EFLAGS); | |||
| 50879 | } | |||
| 50880 | ||||
| 50881 | if (CC == X86::COND_BE) { | |||
| 50882 | // X + SETBE --> sbb X, -1 | |||
| 50883 | // X - SETBE --> adc X, -1 | |||
| 50884 | // Try to convert COND_BE into COND_AE in an attempt to facilitate | |||
| 50885 | // materializing "setae reg". | |||
| 50886 | // | |||
| 50887 | // Do not flip "e <= c", where "c" is a constant, because Cmp instruction | |||
| 50888 | // cannot take an immediate as its first operand. | |||
| 50889 | // | |||
| 50890 | if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() && | |||
| 50891 | EFLAGS.getValueType().isInteger() && | |||
| 50892 | !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { | |||
| 50893 | SDValue NewSub = | |||
| 50894 | DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), | |||
| 50895 | EFLAGS.getOperand(1), EFLAGS.getOperand(0)); | |||
| 50896 | SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo()); | |||
| 50897 | return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, | |||
| 50898 | DAG.getVTList(VT, MVT::i32), X, | |||
| 50899 | DAG.getConstant(-1, DL, VT), NewEFLAGS); | |||
| 50900 | } | |||
| 50901 | } | |||
| 50902 | ||||
| 50903 | if (CC != X86::COND_E && CC != X86::COND_NE) | |||
| 50904 | return SDValue(); | |||
| 50905 | ||||
| 50906 | if (EFLAGS.getOpcode() != X86ISD::CMP || !EFLAGS.hasOneUse() || | |||
| 50907 | !X86::isZeroNode(EFLAGS.getOperand(1)) || | |||
| 50908 | !EFLAGS.getOperand(0).getValueType().isInteger()) | |||
| 50909 | return SDValue(); | |||
| 50910 | ||||
| 50911 | SDValue Z = EFLAGS.getOperand(0); | |||
| 50912 | EVT ZVT = Z.getValueType(); | |||
| 50913 | ||||
| 50914 | // If X is -1 or 0, then we have an opportunity to avoid constants required in | |||
| 50915 | // the general case below. | |||
| 50916 | if (ConstantX) { | |||
| 50917 | // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with | |||
| 50918 | // fake operands: | |||
| 50919 | // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z) | |||
| 50920 | // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z) | |||
| 50921 | if ((IsSub && CC == X86::COND_NE && ConstantX->isZero()) || | |||
| 50922 | (!IsSub && CC == X86::COND_E && ConstantX->isAllOnes())) { | |||
| 50923 | SDValue Zero = DAG.getConstant(0, DL, ZVT); | |||
| 50924 | SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); | |||
| 50925 | SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z); | |||
| 50926 | return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, | |||
| 50927 | DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), | |||
| 50928 | SDValue(Neg.getNode(), 1)); | |||
| 50929 | } | |||
| 50930 | ||||
| 50931 | // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb' | |||
| 50932 | // with fake operands: | |||
| 50933 | // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1) | |||
| 50934 | // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1) | |||
| 50935 | if ((IsSub && CC == X86::COND_E && ConstantX->isZero()) || | |||
| 50936 | (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnes())) { | |||
| 50937 | SDValue One = DAG.getConstant(1, DL, ZVT); | |||
| 50938 | SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); | |||
| 50939 | SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One); | |||
| 50940 | return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, | |||
| 50941 | DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), | |||
| 50942 | Cmp1.getValue(1)); | |||
| 50943 | } | |||
| 50944 | } | |||
| 50945 | ||||
| 50946 | // (cmp Z, 1) sets the carry flag if Z is 0. | |||
| 50947 | SDValue One = DAG.getConstant(1, DL, ZVT); | |||
| 50948 | SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32); | |||
| 50949 | SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One); | |||
| 50950 | ||||
| 50951 | // Add the flags type for ADC/SBB nodes. | |||
| 50952 | SDVTList VTs = DAG.getVTList(VT, MVT::i32); | |||
| 50953 | ||||
| 50954 | // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1) | |||
| 50955 | // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1) | |||
| 50956 | if (CC == X86::COND_NE) | |||
| 50957 | return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X, | |||
| 50958 | DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1)); | |||
| 50959 | ||||
| 50960 | // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1) | |||
| 50961 | // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1) | |||
| 50962 | return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X, | |||
| 50963 | DAG.getConstant(0, DL, VT), Cmp1.getValue(1)); | |||
| 50964 | } | |||
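| | // Carry-flag check for the final folds (illustrative): (cmp Z, 1) computes | |||
| | // Z - 1 and sets CF iff Z == 0 (unsigned Z < 1). Then sbb X, -1 evaluates | |||
| | // X - (-1) - CF = X + 1 - CF, i.e. X + (Z != 0); and adc X, -1 evaluates | |||
| | // X + (-1) + CF = X - 1 + CF, i.e. X - (Z != 0). | |||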
| 50965 | ||||
| 50966 | /// If this is an add or subtract where one operand is produced by a cmp+setcc, | |||
| 50967 | /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} | |||
| 50968 | /// with CMP+{ADC, SBB}. | |||
| 50969 | static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { | |||
| 50970 | bool IsSub = N->getOpcode() == ISD::SUB; | |||
| 50971 | SDValue X = N->getOperand(0); | |||
| 50972 | SDValue Y = N->getOperand(1); | |||
| 50973 | EVT VT = N->getValueType(0); | |||
| 50974 | SDLoc DL(N); | |||
| 50975 | ||||
| 50976 | if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG)) | |||
| 50977 | return ADCOrSBB; | |||
| 50978 | ||||
| 50979 | // Commute and try again (negate the result for subtracts). | |||
| 50980 | if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, Y, X, DAG)) { | |||
| 50981 | if (IsSub) | |||
| 50982 | ADCOrSBB = | |||
| 50983 | DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), ADCOrSBB); | |||
| 50984 | return ADCOrSBB; | |||
| 50985 | } | |||
| 50986 | ||||
| 50987 | return SDValue(); | |||
| 50988 | } | |||
| 50989 | ||||
| 50990 | static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1, | |||
| 50991 | SelectionDAG &DAG) { | |||
| 50992 | assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::OR) && | |||
| 50993 | "Unexpected opcode"); | |||
| 50994 | ||||
| 50995 | // Delegate to combineAddOrSubToADCOrSBB if we have: | |||
| 50996 | // | |||
| 50997 | // (xor/or (zero_extend (setcc)) imm) | |||
| 50998 | // | |||
| 50999 | // where imm is odd if and only if we have xor, in which case the XOR/OR are | |||
| 51000 | // equivalent to a SUB/ADD, respectively. | |||
| 51001 | if (N0.getOpcode() == ISD::ZERO_EXTEND && | |||
| 51002 | N0.getOperand(0).getOpcode() == X86ISD::SETCC && N0.hasOneUse()) { | |||
| 51003 | if (auto *N1C = dyn_cast<ConstantSDNode>(N1)) { | |||
| 51004 | bool IsSub = N->getOpcode() == ISD::XOR; | |||
| 51005 | bool N1COdd = N1C->getZExtValue() & 1; | |||
| 51006 | if (IsSub ? N1COdd : !N1COdd) { | |||
| 51007 | SDLoc DL(N); | |||
| 51008 | EVT VT = N->getValueType(0); | |||
| 51009 | if (SDValue R = combineAddOrSubToADCOrSBB(IsSub, DL, VT, N1, N0, DAG)) | |||
| 51010 | return R; | |||
| 51011 | } | |||
| 51012 | } | |||
| 51013 | } | |||
| 51014 | ||||
| 51015 | return SDValue(); | |||
| 51016 | } | |||
| 51017 | ||||
| 51018 | static SDValue combineOr(SDNode *N, SelectionDAG &DAG, | |||
| 51019 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 51020 | const X86Subtarget &Subtarget) { | |||
| 51021 | SDValue N0 = N->getOperand(0); | |||
| 51022 | SDValue N1 = N->getOperand(1); | |||
| 51023 | EVT VT = N->getValueType(0); | |||
| 51024 | SDLoc dl(N); | |||
| 51025 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 51026 | ||||
| 51027 | // If this is SSE1-only, convert to FOR to avoid scalarization. | |||
| 51028 | if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { | |||
| 51029 | return DAG.getBitcast(MVT::v4i32, | |||
| 51030 | DAG.getNode(X86ISD::FOR, dl, MVT::v4f32, | |||
| 51031 | DAG.getBitcast(MVT::v4f32, N0), | |||
| 51032 | DAG.getBitcast(MVT::v4f32, N1))); | |||
| 51033 | } | |||
| 51034 | ||||
| 51035 | // Match any-of bool scalar reductions into a bitcast/movmsk + cmp. | |||
| 51036 | // TODO: Support multiple SrcOps. | |||
| 51037 | if (VT == MVT::i1) { | |||
| 51038 | SmallVector<SDValue, 2> SrcOps; | |||
| 51039 | SmallVector<APInt, 2> SrcPartials; | |||
| 51040 | if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) && | |||
| 51041 | SrcOps.size() == 1) { | |||
| 51042 | unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements(); | |||
| 51043 | EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); | |||
| 51044 | SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget); | |||
| 51045 | if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType())) | |||
| 51046 | Mask = DAG.getBitcast(MaskVT, SrcOps[0]); | |||
| 51047 | if (Mask) { | |||
| 51048 | assert(SrcPartials[0].getBitWidth() == NumElts && | |||
| 51049 | "Unexpected partial reduction mask"); | |||
| 51050 | SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT); | |||
| 51051 | SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT); | |||
| 51052 | Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits); | |||
| 51053 | return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE); | |||
| 51054 | } | |||
| 51055 | } | |||
| 51056 | } | |||
| 51057 | ||||
| 51058 | if (SDValue R = combineBitOpWithMOVMSK(N, DAG)) | |||
| 51059 | return R; | |||
| 51060 | ||||
| 51061 | if (SDValue R = combineBitOpWithShift(N, DAG)) | |||
| 51062 | return R; | |||
| 51063 | ||||
| 51064 | if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget)) | |||
| 51065 | return FPLogic; | |||
| 51066 | ||||
| 51067 | if (DCI.isBeforeLegalizeOps()) | |||
| 51068 | return SDValue(); | |||
| 51069 | ||||
| 51070 | if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget)) | |||
| 51071 | return R; | |||
| 51072 | ||||
| 51073 | if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget)) | |||
| 51074 | return R; | |||
| 51075 | ||||
| 51076 | if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) | |||
| 51077 | return R; | |||
| 51078 | ||||
| 51079 | // (0 - SetCC) | C -> (zext (not SetCC)) * (C + 1) - 1 if we can get a LEA out of it. | |||
| 51080 | if ((VT == MVT::i32 || VT == MVT::i64) && | |||
| 51081 | N0.getOpcode() == ISD::SUB && N0.hasOneUse() && | |||
| 51082 | isNullConstant(N0.getOperand(0))) { | |||
| 51083 | SDValue Cond = N0.getOperand(1); | |||
| 51084 | if (Cond.getOpcode() == ISD::ZERO_EXTEND && Cond.hasOneUse()) | |||
| 51085 | Cond = Cond.getOperand(0); | |||
| 51086 | ||||
| 51087 | if (Cond.getOpcode() == X86ISD::SETCC && Cond.hasOneUse()) { | |||
| 51088 | if (auto *CN = dyn_cast<ConstantSDNode>(N1)) { | |||
| 51089 | uint64_t Val = CN->getZExtValue(); | |||
| 51090 | if (Val == 1 || Val == 2 || Val == 3 || Val == 4 || Val == 7 || Val == 8) { | |||
| 51091 | X86::CondCode CCode = (X86::CondCode)Cond.getConstantOperandVal(0); | |||
| 51092 | CCode = X86::GetOppositeBranchCondition(CCode); | |||
| 51093 | SDValue NotCond = getSETCC(CCode, Cond.getOperand(1), SDLoc(Cond), DAG); | |||
| 51094 | ||||
| 51095 | SDValue R = DAG.getZExtOrTrunc(NotCond, dl, VT); | |||
| 51096 | R = DAG.getNode(ISD::MUL, dl, VT, R, DAG.getConstant(Val + 1, dl, VT)); | |||
| 51097 | R = DAG.getNode(ISD::SUB, dl, VT, R, DAG.getConstant(1, dl, VT)); | |||
| 51098 | return R; | |||
| 51099 | } | |||
| 51100 | } | |||
| 51101 | } | |||
| 51102 | } | |||
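| | // Identity check (illustrative): with b = SetCC in {0,1} and C from the list | |||
| | // above, (0 - b) | C is -1 when b == 1 and C when b == 0. The replacement | |||
| | // (zext(!b)) * (C + 1) - 1 gives 0*(C+1) - 1 = -1 and 1*(C+1) - 1 = C, and | |||
| | // the multiply by 2/3/4/5/8/9 plus decrement can fold into LEA addressing. | |||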
| 51103 | ||||
| 51104 | // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y). | |||
| 51105 | // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X). | |||
| 51106 | // iff the upper elements of the non-shifted arg are zero. | |||
| 51107 | // KUNPCK requires 16+ bool vector elements. | |||
| 51108 | if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) { | |||
| 51109 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 51110 | unsigned HalfElts = NumElts / 2; | |||
| 51111 | APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts); | |||
| 51112 | if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL && | |||
| 51113 | N1.getConstantOperandAPInt(1) == HalfElts && | |||
| 51114 | DAG.MaskedVectorIsZero(N0, UpperElts)) { | |||
| 51115 | return DAG.getNode( | |||
| 51116 | ISD::CONCAT_VECTORS, dl, VT, | |||
| 51117 | extractSubVector(N0, 0, DAG, dl, HalfElts), | |||
| 51118 | extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts)); | |||
| 51119 | } | |||
| 51120 | if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL && | |||
| 51121 | N0.getConstantOperandAPInt(1) == HalfElts && | |||
| 51122 | DAG.MaskedVectorIsZero(N1, UpperElts)) { | |||
| 51123 | return DAG.getNode( | |||
| 51124 | ISD::CONCAT_VECTORS, dl, VT, | |||
| 51125 | extractSubVector(N1, 0, DAG, dl, HalfElts), | |||
| 51126 | extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts)); | |||
| 51127 | } | |||
| 51128 | } | |||
| 51129 | ||||
| 51130 | if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { | |||
| 51131 | // Attempt to recursively combine an OR of shuffles. | |||
| 51132 | SDValue Op(N, 0); | |||
| 51133 | if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) | |||
| 51134 | return Res; | |||
| 51135 | ||||
| 51136 | // If either operand is a constant mask, then only the elements that aren't | |||
| 51137 | // allones are actually demanded by the other operand. | |||
| 51138 | auto SimplifyUndemandedElts = [&](SDValue Op, SDValue OtherOp) { | |||
| 51139 | APInt UndefElts; | |||
| 51140 | SmallVector<APInt> EltBits; | |||
| 51141 | int NumElts = VT.getVectorNumElements(); | |||
| 51142 | int EltSizeInBits = VT.getScalarSizeInBits(); | |||
| 51143 | if (!getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, EltBits)) | |||
| 51144 | return false; | |||
| 51145 | ||||
| 51146 | APInt DemandedElts = APInt::getZero(NumElts); | |||
| 51147 | for (int I = 0; I != NumElts; ++I) | |||
| 51148 | if (!EltBits[I].isAllOnes()) | |||
| 51149 | DemandedElts.setBit(I); | |||
| 51150 | ||||
| 51151 | return TLI.SimplifyDemandedVectorElts(OtherOp, DemandedElts, DCI); | |||
| 51152 | }; | |||
| 51153 | if (SimplifyUndemandedElts(N0, N1) || SimplifyUndemandedElts(N1, N0)) { | |||
| 51154 | if (N->getOpcode() != ISD::DELETED_NODE) | |||
| 51155 | DCI.AddToWorklist(N); | |||
| 51156 | return SDValue(N, 0); | |||
| 51157 | } | |||
| 51158 | } | |||
| 51159 | ||||
| 51160 | // We should fold "masked merge" patterns when `andn` is not available. | |||
| 51161 | if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1) | |||
| 51162 | if (SDValue R = foldMaskedMerge(N, DAG)) | |||
| 51163 | return R; | |||
| 51164 | ||||
| 51165 | if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG)) | |||
| 51166 | return R; | |||
| 51167 | ||||
| 51168 | return SDValue(); | |||
| 51169 | } | |||
| 51170 | ||||
| 51171 | /// Try to turn tests against the signbit in the form of: | |||
| 51172 | /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) | |||
| 51173 | /// into: | |||
| 51174 | /// SETGT(X, -1) | |||
| 51175 | static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) { | |||
| 51176 | // This is only worth doing if the output type is i8 or i1. | |||
| 51177 | EVT ResultType = N->getValueType(0); | |||
| 51178 | if (ResultType != MVT::i8 && ResultType != MVT::i1) | |||
| 51179 | return SDValue(); | |||
| 51180 | ||||
| 51181 | SDValue N0 = N->getOperand(0); | |||
| 51182 | SDValue N1 = N->getOperand(1); | |||
| 51183 | ||||
| 51184 | // We should be performing an xor against a truncated shift. | |||
| 51185 | if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse()) | |||
| 51186 | return SDValue(); | |||
| 51187 | ||||
| 51188 | // Make sure we are performing an xor against one. | |||
| 51189 | if (!isOneConstant(N1)) | |||
| 51190 | return SDValue(); | |||
| 51191 | ||||
| 51192 | // SetCC on x86 zero-extends, so only act on this if it's a logical shift. | |||
| 51193 | SDValue Shift = N0.getOperand(0); | |||
| 51194 | if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse()) | |||
| 51195 | return SDValue(); | |||
| 51196 | ||||
| 51197 | // Make sure we are truncating from one of i16, i32 or i64. | |||
| 51198 | EVT ShiftTy = Shift.getValueType(); | |||
| 51199 | if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64) | |||
| 51200 | return SDValue(); | |||
| 51201 | ||||
| 51202 | // Make sure the shift amount extracts the sign bit. | |||
| 51203 | if (!isa<ConstantSDNode>(Shift.getOperand(1)) || | |||
| 51204 | Shift.getConstantOperandAPInt(1) != (ShiftTy.getSizeInBits() - 1)) | |||
| 51205 | return SDValue(); | |||
| 51206 | ||||
| 51207 | // Create a greater-than comparison against -1. | |||
| 51208 | // N.B. Using SETGE against 0 works, but we want a canonical-looking | |||
| 51209 | // comparison; using SETGT matches up with what TranslateX86CC expects. | |||
| 51210 | SDLoc DL(N); | |||
| 51211 | SDValue ShiftOp = Shift.getOperand(0); | |||
| 51212 | EVT ShiftOpTy = ShiftOp.getValueType(); | |||
| 51213 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 51214 | EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(), | |||
| 51215 | *DAG.getContext(), ResultType); | |||
| 51216 | SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp, | |||
| 51217 | DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT); | |||
| 51218 | if (SetCCResultType != ResultType) | |||
| 51219 | Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond); | |||
| 51220 | return Cond; | |||
| 51221 | } | |||
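// A minimal standalone sketch (not part of this file, helper names
// illustrative) of the scalar identity this combine relies on: xor'ing the
// truncated sign bit with 1 yields 1 exactly when X is non-negative, which is
// the SETGT-against--1 form being emitted.
#include <cassert>
#include <cstdint>

static uint8_t viaShiftXor(int32_t X) {
  // XOR(TRUNCATE(SRL(X, 31)), 1)
  return static_cast<uint8_t>(static_cast<uint32_t>(X) >> 31) ^ 1;
}

static uint8_t viaSetGT(int32_t X) {
  // SETGT(X, -1)
  return X > -1;
}

int main() {
  for (int32_t X : {INT32_MIN, -2, -1, 0, 1, INT32_MAX})
    assert(viaShiftXor(X) == viaSetGT(X));
}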
| 51222 | ||||
| 51223 | /// Turn vector tests of the signbit in the form of: | |||
| 51224 | /// xor (sra X, elt_size(X)-1), -1 | |||
| 51225 | /// into: | |||
| 51226 | /// pcmpgt X, -1 | |||
| 51227 | /// | |||
| 51228 | /// This should be called before type legalization because the pattern may not | |||
| 51229 | /// persist after that. | |||
| 51230 | static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG, | |||
| 51231 | const X86Subtarget &Subtarget) { | |||
| 51232 | EVT VT = N->getValueType(0); | |||
| 51233 | if (!VT.isSimple()) | |||
| 51234 | return SDValue(); | |||
| 51235 | ||||
| 51236 | switch (VT.getSimpleVT().SimpleTy) { | |||
| 51237 | default: return SDValue(); | |||
| 51238 | case MVT::v16i8: | |||
| 51239 | case MVT::v8i16: | |||
| 51240 | case MVT::v4i32: | |||
| 51241 | case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break; | |||
| 51242 | case MVT::v32i8: | |||
| 51243 | case MVT::v16i16: | |||
| 51244 | case MVT::v8i32: | |||
| 51245 | case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break; | |||
| 51246 | } | |||
| 51247 | ||||
| 51248 | // There must be an arithmetic shift right (SRA) before the xor, and the xor | |||
| 51249 | // must be a 'not' operation, i.e. an xor against all-ones. | |||
| 51250 | SDValue Shift = N->getOperand(0); | |||
| 51251 | SDValue Ones = N->getOperand(1); | |||
| 51252 | if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() || | |||
| 51253 | !ISD::isBuildVectorAllOnes(Ones.getNode())) | |||
| 51254 | return SDValue(); | |||
| 51255 | ||||
| 51256 | // The shift should be smearing the sign bit across each vector element. | |||
| 51257 | auto *ShiftAmt = | |||
| 51258 | isConstOrConstSplat(Shift.getOperand(1), /*AllowUndefs*/ true); | |||
| 51259 | if (!ShiftAmt || | |||
| 51260 | ShiftAmt->getAPIntValue() != (Shift.getScalarValueSizeInBits() - 1)) | |||
| 51261 | return SDValue(); | |||
| 51262 | ||||
| 51263 | // Create a greater-than comparison against -1. We don't use the more obvious | |||
| 51264 | // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction. | |||
| 51265 | return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT); | |||
| 51266 | } | |||
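// A per-element sketch of the vector identity above (names illustrative),
// checked exhaustively for i8 lanes: smearing the sign bit with an arithmetic
// shift and inverting produces the same all-ones/all-zeros lane as a signed
// compare against -1, which is what each PCMPGT lane computes.
#include <cassert>
#include <cstdint>

static int8_t viaSraNot(int8_t X) {
  return static_cast<int8_t>(~(X >> 7)); // xor (sra X, 7), -1
}

static int8_t viaPcmpgt(int8_t X) {
  return X > -1 ? int8_t(-1) : int8_t(0); // pcmpgtb lane: all-ones or zero
}

int main() {
  for (int V = -128; V <= 127; ++V)
    assert(viaSraNot(static_cast<int8_t>(V)) ==
           viaPcmpgt(static_cast<int8_t>(V)));
}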
| 51267 | ||||
| 51268 | /// Detect patterns of truncation with unsigned saturation: | |||
| 51269 | /// | |||
| 51270 | /// 1. (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type). | |||
| 51271 | /// Return the source value x to be truncated or SDValue() if the pattern was | |||
| 51272 | /// not matched. | |||
| 51273 | /// | |||
| 51274 | /// 2. (truncate (smin (smax (x, C1), C2)) to dest_type), | |||
| 51275 | /// where C1 >= 0 and C2 is unsigned max of destination type. | |||
| 51276 | /// | |||
| 51277 | /// (truncate (smax (smin (x, C2), C1)) to dest_type) | |||
| 51278 | /// where C1 >= 0, C2 is unsigned max of destination type and C1 <= C2. | |||
| 51279 | /// | |||
| 51280 | /// These two patterns are equivalent to: | |||
| 51281 | /// (truncate (umin (smax(x, C1), unsigned_max_of_dest_type)) to dest_type) | |||
| 51282 | /// So return the smax(x, C1) value to be truncated or SDValue() if the | |||
| 51283 | /// pattern was not matched. | |||
| 51284 | static SDValue detectUSatPattern(SDValue In, EVT VT, SelectionDAG &DAG, | |||
| 51285 | const SDLoc &DL) { | |||
| 51286 | EVT InVT = In.getValueType(); | |||
| 51287 | ||||
| 51288 | // Saturation with truncation. We truncate from InVT to VT. | |||
| 51289 | assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() && | |||
| 51290 | "Unexpected types for truncate operation"); | |||
| 51291 | ||||
| 51292 | // Match min/max and return limit value as a parameter. | |||
| 51293 | auto MatchMinMax = [](SDValue V, unsigned Opcode, APInt &Limit) -> SDValue { | |||
| 51294 | if (V.getOpcode() == Opcode && | |||
| 51295 | ISD::isConstantSplatVector(V.getOperand(1).getNode(), Limit)) | |||
| 51296 | return V.getOperand(0); | |||
| 51297 | return SDValue(); | |||
| 51298 | }; | |||
| 51299 | ||||
| 51300 | APInt C1, C2; | |||
| 51301 | if (SDValue UMin = MatchMinMax(In, ISD::UMIN, C2)) | |||
| 51302 | // C2 should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to | |||
| 51303 | // the element size of the destination type. | |||
| 51304 | if (C2.isMask(VT.getScalarSizeInBits())) | |||
| 51305 | return UMin; | |||
| 51306 | ||||
| 51307 | if (SDValue SMin = MatchMinMax(In, ISD::SMIN, C2)) | |||
| 51308 | if (MatchMinMax(SMin, ISD::SMAX, C1)) | |||
| 51309 | if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits())) | |||
| 51310 | return SMin; | |||
| 51311 | ||||
| 51312 | if (SDValue SMax = MatchMinMax(In, ISD::SMAX, C1)) | |||
| 51313 | if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, C2)) | |||
| 51314 | if (C1.isNonNegative() && C2.isMask(VT.getScalarSizeInBits()) && | |||
| 51315 | C2.uge(C1)) { | |||
| 51316 | return DAG.getNode(ISD::SMAX, DL, InVT, SMin, In.getOperand(1)); | |||
| 51317 | } | |||
| 51318 | ||||
| 51319 | return SDValue(); | |||
| 51320 | } | |||
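// A scalar sketch (assumed i32 -> i8 semantics, names illustrative) of why
// pattern 2 above is an unsigned-saturating truncate: once smax has pinned
// the value at C1 >= 0, smin against the unsigned max of the destination
// behaves exactly like umin, so the low bits can be kept directly.
#include <algorithm>
#include <cassert>
#include <cstdint>

static uint8_t viaSMinSMax(int32_t X, int32_t C1) { // requires C1 >= 0
  return static_cast<uint8_t>(std::min(std::max(X, C1), 255));
}

static uint8_t viaUSatTrunc(int32_t X, int32_t C1) {
  uint32_t V = static_cast<uint32_t>(std::max(X, C1));      // smax result, >= 0
  return static_cast<uint8_t>(std::min<uint32_t>(V, 255u)); // umin + trunc
}

int main() {
  for (int32_t X : {-100000, -1, 0, 7, 200, 255, 256, 100000})
    for (int32_t C1 : {0, 3, 255})
      assert(viaSMinSMax(X, C1) == viaUSatTrunc(X, C1));
}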
| 51321 | ||||
| 51322 | /// Detect patterns of truncation with signed saturation: | |||
| 51323 | /// (truncate (smin ((smax (x, signed_min_of_dest_type)), | |||
| 51324 | /// signed_max_of_dest_type)) to dest_type) | |||
| 51325 | /// or: | |||
| 51326 | /// (truncate (smax ((smin (x, signed_max_of_dest_type)), | |||
| 51327 | /// signed_min_of_dest_type)) to dest_type). | |||
| 51328 | /// With MatchPackUS, the smax/smin range is [0, unsigned_max_of_dest_type]. | |||
| 51329 | /// Return the source value to be truncated or SDValue() if the pattern was not | |||
| 51330 | /// matched. | |||
| 51331 | static SDValue detectSSatPattern(SDValue In, EVT VT, bool MatchPackUS = false) { | |||
| 51332 | unsigned NumDstBits = VT.getScalarSizeInBits(); | |||
| 51333 | unsigned NumSrcBits = In.getScalarValueSizeInBits(); | |||
| 51334 | assert(NumSrcBits > NumDstBits && "Unexpected types for truncate operation"); | |||
| 51335 | ||||
| 51336 | auto MatchMinMax = [](SDValue V, unsigned Opcode, | |||
| 51337 | const APInt &Limit) -> SDValue { | |||
| 51338 | APInt C; | |||
| 51339 | if (V.getOpcode() == Opcode && | |||
| 51340 | ISD::isConstantSplatVector(V.getOperand(1).getNode(), C) && C == Limit) | |||
| 51341 | return V.getOperand(0); | |||
| 51342 | return SDValue(); | |||
| 51343 | }; | |||
| 51344 | ||||
| 51345 | APInt SignedMax, SignedMin; | |||
| 51346 | if (MatchPackUS) { | |||
| 51347 | SignedMax = APInt::getAllOnes(NumDstBits).zext(NumSrcBits); | |||
| 51348 | SignedMin = APInt(NumSrcBits, 0); | |||
| 51349 | } else { | |||
| 51350 | SignedMax = APInt::getSignedMaxValue(NumDstBits).sext(NumSrcBits); | |||
| 51351 | SignedMin = APInt::getSignedMinValue(NumDstBits).sext(NumSrcBits); | |||
| 51352 | } | |||
| 51353 | ||||
| 51354 | if (SDValue SMin = MatchMinMax(In, ISD::SMIN, SignedMax)) | |||
| 51355 | if (SDValue SMax = MatchMinMax(SMin, ISD::SMAX, SignedMin)) | |||
| 51356 | return SMax; | |||
| 51357 | ||||
| 51358 | if (SDValue SMax = MatchMinMax(In, ISD::SMAX, SignedMin)) | |||
| 51359 | if (SDValue SMin = MatchMinMax(SMax, ISD::SMIN, SignedMax)) | |||
| 51360 | return SMin; | |||
| 51361 | ||||
| 51362 | return SDValue(); | |||
| 51363 | } | |||
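// A scalar sketch of the signed-saturation clamp being matched, for an
// i32 -> i8 truncate (helper name illustrative): the smin/smax pair, in
// either order, pins the value into [INT8_MIN, INT8_MAX] before the high
// bits are dropped.
#include <algorithm>
#include <cassert>
#include <cstdint>

static int8_t ssatTruncI32ToI8(int32_t X) {
  int32_t Clamped = std::max(std::min(X, 127), -128); // smax(smin(x, MAX), MIN)
  return static_cast<int8_t>(Clamped);
}

int main() {
  assert(ssatTruncI32ToI8(1000) == 127);   // saturates high
  assert(ssatTruncI32ToI8(-1000) == -128); // saturates low
  assert(ssatTruncI32ToI8(42) == 42);      // in range, unchanged
}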
| 51364 | ||||
| 51365 | static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL, | |||
| 51366 | SelectionDAG &DAG, | |||
| 51367 | const X86Subtarget &Subtarget) { | |||
| 51368 | if (!Subtarget.hasSSE2() || !VT.isVector()) | |||
| 51369 | return SDValue(); | |||
| 51370 | ||||
| 51371 | EVT SVT = VT.getVectorElementType(); | |||
| 51372 | EVT InVT = In.getValueType(); | |||
| 51373 | EVT InSVT = InVT.getVectorElementType(); | |||
| 51374 | ||||
| 51375 | // If we're clamping a signed 32-bit vector to 0-255 and the 32-bit vector is | |||
| 51376 | // split across two registers, we can use a packusdw+perm to clamp to 0-65535 | |||
| 51377 | // and concatenate at the same time. Then we can use a final vpmovuswb to | |||
| 51378 | // clip to 0-255. | |||
| 51379 | if (Subtarget.hasBWI() && !Subtarget.useAVX512Regs() && | |||
| 51380 | InVT == MVT::v16i32 && VT == MVT::v16i8) { | |||
| 51381 | if (SDValue USatVal = detectSSatPattern(In, VT, true)) { | |||
| 51382 | // Emit a VPACKUSDW+VPERMQ followed by a VPMOVUSWB. | |||
| 51383 | SDValue Mid = truncateVectorWithPACK(X86ISD::PACKUS, MVT::v16i16, USatVal, | |||
| 51384 | DL, DAG, Subtarget); | |||
| 51385 | assert(Mid && "Failed to pack!"); | |||
| 51386 | return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Mid); | |||
| 51387 | } | |||
| 51388 | } | |||
| 51389 | ||||
| 51390 | // vXi32 truncate instructions are available with AVX512F. | |||
| 51391 | // vXi16 truncate instructions are only available with AVX512BW. | |||
| 51392 | // For 256-bit or smaller vectors, we require VLX. | |||
| 51393 | // FIXME: We could widen truncates to 512 to remove the VLX restriction. | |||
| 51394 | // If the result type is 256 bits or larger and we have disabled 512-bit | |||
| 51395 | // registers, we should go ahead and use the pack instructions if possible. | |||
| 51396 | bool PreferAVX512 = ((Subtarget.hasAVX512() && InSVT == MVT::i32) || | |||
| 51397 | (Subtarget.hasBWI() && InSVT == MVT::i16)) && | |||
| 51398 | (InVT.getSizeInBits() > 128) && | |||
| 51399 | (Subtarget.hasVLX() || InVT.getSizeInBits() > 256) && | |||
| 51400 | !(!Subtarget.useAVX512Regs() && VT.getSizeInBits() >= 256); | |||
| 51401 | ||||
| 51402 | if (isPowerOf2_32(VT.getVectorNumElements()) && !PreferAVX512 && | |||
| 51403 | VT.getSizeInBits() >= 64 && | |||
| 51404 | (SVT == MVT::i8 || SVT == MVT::i16) && | |||
| 51405 | (InSVT == MVT::i16 || InSVT == MVT::i32)) { | |||
| 51406 | if (SDValue USatVal = detectSSatPattern(In, VT, true)) { | |||
| 51407 | // vXi32 -> vXi8 must be performed as PACKUSWB(PACKSSDW,PACKSSDW). | |||
| 51408 | // Only do this when the result is at least 64 bits, or we'd be leaving | |||
| 51409 | // dangling PACKSSDW nodes. | |||
| 51410 | if (SVT == MVT::i8 && InSVT == MVT::i32) { | |||
| 51411 | EVT MidVT = VT.changeVectorElementType(MVT::i16); | |||
| 51412 | SDValue Mid = truncateVectorWithPACK(X86ISD::PACKSS, MidVT, USatVal, DL, | |||
| 51413 | DAG, Subtarget); | |||
| 51414 | assert(Mid && "Failed to pack!"); | |||
| 51415 | SDValue V = truncateVectorWithPACK(X86ISD::PACKUS, VT, Mid, DL, DAG, | |||
| 51416 | Subtarget); | |||
| 51417 | assert(V && "Failed to pack!"); | |||
| 51418 | return V; | |||
| 51419 | } else if (SVT == MVT::i8 || Subtarget.hasSSE41()) | |||
| 51420 | return truncateVectorWithPACK(X86ISD::PACKUS, VT, USatVal, DL, DAG, | |||
| 51421 | Subtarget); | |||
| 51422 | } | |||
| 51423 | if (SDValue SSatVal = detectSSatPattern(In, VT)) | |||
| 51424 | return truncateVectorWithPACK(X86ISD::PACKSS, VT, SSatVal, DL, DAG, | |||
| 51425 | Subtarget); | |||
| 51426 | } | |||
| 51427 | ||||
| 51428 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 51429 | if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 && | |||
| 51430 | Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI()) && | |||
| 51431 | (SVT == MVT::i32 || SVT == MVT::i16 || SVT == MVT::i8)) { | |||
| 51432 | unsigned TruncOpc = 0; | |||
| 51433 | SDValue SatVal; | |||
| 51434 | if (SDValue SSatVal = detectSSatPattern(In, VT)) { | |||
| 51435 | SatVal = SSatVal; | |||
| 51436 | TruncOpc = X86ISD::VTRUNCS; | |||
| 51437 | } else if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) { | |||
| 51438 | SatVal = USatVal; | |||
| 51439 | TruncOpc = X86ISD::VTRUNCUS; | |||
| 51440 | } | |||
| 51441 | if (SatVal) { | |||
| 51442 | unsigned ResElts = VT.getVectorNumElements(); | |||
| 51443 | // If the input type is less than 512 bits and we don't have VLX, we need | |||
| 51444 | // to widen to 512 bits. | |||
| 51445 | if (!Subtarget.hasVLX() && !InVT.is512BitVector()) { | |||
| 51446 | unsigned NumConcats = 512 / InVT.getSizeInBits(); | |||
| 51447 | ResElts *= NumConcats; | |||
| 51448 | SmallVector<SDValue, 4> ConcatOps(NumConcats, DAG.getUNDEF(InVT)); | |||
| 51449 | ConcatOps[0] = SatVal; | |||
| 51450 | InVT = EVT::getVectorVT(*DAG.getContext(), InSVT, | |||
| 51451 | NumConcats * InVT.getVectorNumElements()); | |||
| 51452 | SatVal = DAG.getNode(ISD::CONCAT_VECTORS, DL, InVT, ConcatOps); | |||
| 51453 | } | |||
| 51454 | // Widen the result if it's narrower than 128 bits. | |||
| 51455 | if (ResElts * SVT.getSizeInBits() < 128) | |||
| 51456 | ResElts = 128 / SVT.getSizeInBits(); | |||
| 51457 | EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), SVT, ResElts); | |||
| 51458 | SDValue Res = DAG.getNode(TruncOpc, DL, TruncVT, SatVal); | |||
| 51459 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, | |||
| 51460 | DAG.getIntPtrConstant(0, DL)); | |||
| 51461 | } | |||
| 51462 | } | |||
| 51463 | ||||
| 51464 | return SDValue(); | |||
| 51465 | } | |||
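// A small arithmetic check of the widening step above: a 128-bit input
// without VLX is padded with undef subvectors up to 512 bits, so
// NumConcats = 512 / 128 = 4 and the result element count scales by the
// same factor. Values below are an assumed v8i16 example.
#include <cassert>

int main() {
  unsigned InBits = 128, ResElts = 8; // e.g. an v8i16 result
  unsigned NumConcats = 512 / InBits; // 4 subvectors: 1 real + 3 undef
  ResElts *= NumConcats;
  assert(NumConcats == 4 && ResElts == 32);
}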
| 51466 | ||||
| 51467 | /// This function detects the AVG pattern between vectors of unsigned i8/i16, | |||
| 51468 | /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient | |||
| 51469 | /// ISD::AVGCEILU (AVG) instruction. | |||
| 51470 | static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG, | |||
| 51471 | const X86Subtarget &Subtarget, | |||
| 51472 | const SDLoc &DL) { | |||
| 51473 | if (!VT.isVector()) | |||
| 51474 | return SDValue(); | |||
| 51475 | EVT InVT = In.getValueType(); | |||
| 51476 | unsigned NumElems = VT.getVectorNumElements(); | |||
| 51477 | ||||
| 51478 | EVT ScalarVT = VT.getVectorElementType(); | |||
| 51479 | if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) && NumElems >= 2)) | |||
| 51480 | return SDValue(); | |||
| 51481 | ||||
| 51482 | // InScalarVT is the intermediate type in the AVG pattern, and it should be | |||
| 51483 | // wider than the original input type (i8/i16). | |||
| 51484 | EVT InScalarVT = InVT.getVectorElementType(); | |||
| 51485 | if (InScalarVT.getFixedSizeInBits() <= ScalarVT.getFixedSizeInBits()) | |||
| 51486 | return SDValue(); | |||
| 51487 | ||||
| 51488 | if (!Subtarget.hasSSE2()) | |||
| 51489 | return SDValue(); | |||
| 51490 | ||||
| 51491 | // Detect the following pattern: | |||
| 51492 | // | |||
| 51493 | // %1 = zext <N x i8> %a to <N x i32> | |||
| 51494 | // %2 = zext <N x i8> %b to <N x i32> | |||
| 51495 | // %3 = add nuw nsw <N x i32> %1, <i32 1 x N> | |||
| 51496 | // %4 = add nuw nsw <N x i32> %3, %2 | |||
| 51497 | // %5 = lshr <N x i32> %4, <i32 1 x N> | |||
| 51498 | // %6 = trunc <N x i32> %5 to <N x i8> | |||
| 51499 | // | |||
| 51500 | // In AVX512, the last instruction can also be a trunc store. | |||
| 51501 | if (In.getOpcode() != ISD::SRL) | |||
| 51502 | return SDValue(); | |||
| 51503 | ||||
| 51504 | // A lambda checking that the given SDValue is a constant vector and each | |||
| 51505 | // element is in the range [Min, Max]. | |||
| 51506 | auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) { | |||
| 51507 | return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) { | |||
| 51508 | return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max)); | |||
| 51509 | }); | |||
| 51510 | }; | |||
| 51511 | ||||
| 51512 | auto IsZExtLike = [DAG = &DAG, ScalarVT](SDValue V) { | |||
| 51513 | unsigned MaxActiveBits = DAG->computeKnownBits(V).countMaxActiveBits(); | |||
| 51514 | return MaxActiveBits <= ScalarVT.getSizeInBits(); | |||
| 51515 | }; | |||
| 51516 | ||||
| 51517 | // Check if each element of the vector is right-shifted by one. | |||
| 51518 | SDValue LHS = In.getOperand(0); | |||
| 51519 | SDValue RHS = In.getOperand(1); | |||
| 51520 | if (!IsConstVectorInRange(RHS, 1, 1)) | |||
| 51521 | return SDValue(); | |||
| 51522 | if (LHS.getOpcode() != ISD::ADD) | |||
| 51523 | return SDValue(); | |||
| 51524 | ||||
| 51525 | // Detect a pattern of a + b + 1 where the order doesn't matter. | |||
| 51526 | SDValue Operands[3]; | |||
| 51527 | Operands[0] = LHS.getOperand(0); | |||
| 51528 | Operands[1] = LHS.getOperand(1); | |||
| 51529 | ||||
| 51530 | auto AVGBuilder = [](SelectionDAG &DAG, const SDLoc &DL, | |||
| 51531 | ArrayRef<SDValue> Ops) { | |||
| 51532 | return DAG.getNode(ISD::AVGCEILU, DL, Ops[0].getValueType(), Ops); | |||
| 51533 | }; | |||
| 51534 | ||||
| 51535 | auto AVGSplitter = [&](std::array<SDValue, 2> Ops) { | |||
| 51536 | for (SDValue &Op : Ops) | |||
| 51537 | if (Op.getValueType() != VT) | |||
| 51538 | Op = DAG.getNode(ISD::TRUNCATE, DL, VT, Op); | |||
| 51539 | // Pad to a power-of-2 vector, split+apply and extract the original vector. | |||
| 51540 | unsigned NumElemsPow2 = PowerOf2Ceil(NumElems); | |||
| 51541 | EVT Pow2VT = EVT::getVectorVT(*DAG.getContext(), ScalarVT, NumElemsPow2); | |||
| 51542 | if (NumElemsPow2 != NumElems) { | |||
| 51543 | for (SDValue &Op : Ops) { | |||
| 51544 | SmallVector<SDValue, 32> EltsOfOp(NumElemsPow2, DAG.getUNDEF(ScalarVT)); | |||
| 51545 | for (unsigned i = 0; i != NumElems; ++i) { | |||
| 51546 | SDValue Idx = DAG.getIntPtrConstant(i, DL); | |||
| 51547 | EltsOfOp[i] = | |||
| 51548 | DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, Op, Idx); | |||
| 51549 | } | |||
| 51550 | Op = DAG.getBuildVector(Pow2VT, DL, EltsOfOp); | |||
| 51551 | } | |||
| 51552 | } | |||
| 51553 | SDValue Res = SplitOpsAndApply(DAG, Subtarget, DL, Pow2VT, Ops, AVGBuilder); | |||
| 51554 | if (NumElemsPow2 == NumElems) | |||
| 51555 | return Res; | |||
| 51556 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, | |||
| 51557 | DAG.getIntPtrConstant(0, DL)); | |||
| 51558 | }; | |||
| 51559 | ||||
| 51560 | // Take care of the case when one of the operands is a constant vector whose | |||
| 51561 | // element is in the range [1, 256] for i8 (or [1, 65536] for i16). | |||
| 51562 | if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) && | |||
| 51563 | IsZExtLike(Operands[0])) { | |||
| 51564 | // The pattern is detected. Subtract one from the constant vector, then | |||
| 51565 | // demote it and emit an ISD::AVGCEILU instruction. | |||
| 51566 | SDValue VecOnes = DAG.getConstant(1, DL, InVT); | |||
| 51567 | Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes); | |||
| 51568 | return AVGSplitter({Operands[0], Operands[1]}); | |||
| 51569 | } | |||
| 51570 | ||||
| 51571 | // Matches 'add like' patterns: add(Op0,Op1) or zext(or(Op0,Op1)). | |||
| 51572 | // Match the 'or' case only if it's 'add-like', i.e. can be replaced by an add. | |||
| 51573 | auto FindAddLike = [&](SDValue V, SDValue &Op0, SDValue &Op1) { | |||
| 51574 | if (ISD::ADD == V.getOpcode()) { | |||
| 51575 | Op0 = V.getOperand(0); | |||
| 51576 | Op1 = V.getOperand(1); | |||
| 51577 | return true; | |||
| 51578 | } | |||
| 51579 | if (ISD::ZERO_EXTEND != V.getOpcode()) | |||
| 51580 | return false; | |||
| 51581 | V = V.getOperand(0); | |||
| 51582 | if (V.getValueType() != VT || ISD::OR != V.getOpcode() || | |||
| 51583 | !DAG.haveNoCommonBitsSet(V.getOperand(0), V.getOperand(1))) | |||
| 51584 | return false; | |||
| 51585 | Op0 = V.getOperand(0); | |||
| 51586 | Op1 = V.getOperand(1); | |||
| 51587 | return true; | |||
| 51588 | }; | |||
| 51589 | ||||
| 51590 | SDValue Op0, Op1; | |||
| 51591 | if (FindAddLike(Operands[0], Op0, Op1)) | |||
| 51592 | std::swap(Operands[0], Operands[1]); | |||
| 51593 | else if (!FindAddLike(Operands[1], Op0, Op1)) | |||
| 51594 | return SDValue(); | |||
| 51595 | Operands[2] = Op0; | |||
| 51596 | Operands[1] = Op1; | |||
| 51597 | ||||
| 51598 | // Now we have three operands of two additions. Check that one of them is a | |||
| 51599 | // constant vector with ones, and the other two can be promoted from i8/i16. | |||
| 51600 | for (SDValue &Op : Operands) { | |||
| 51601 | if (!IsConstVectorInRange(Op, 1, 1)) | |||
| 51602 | continue; | |||
| 51603 | std::swap(Op, Operands[2]); | |||
| 51604 | ||||
| 51605 | // Check if Operands[0] and Operands[1] are results of type promotion. | |||
| 51606 | for (int j = 0; j < 2; ++j) | |||
| 51607 | if (Operands[j].getValueType() != VT) | |||
| 51608 | if (!IsZExtLike(Operands[j])) | |||
| 51609 | return SDValue(); | |||
| 51610 | ||||
| 51611 | // The pattern is detected, emit ISD::AVGCEILU instruction(s). | |||
| 51612 | return AVGSplitter({Operands[0], Operands[1]}); | |||
| 51613 | } | |||
| 51614 | ||||
| 51615 | return SDValue(); | |||
| 51616 | } | |||
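// A scalar sketch of the rounding-average identity behind this combine
// (helper name illustrative): in the widened type, (a + b + 1) / 2 cannot
// overflow, and it is exactly what ISD::AVGCEILU (PAVGB/PAVGW on x86)
// computes without ever widening.
#include <cassert>
#include <cstdint>

static uint8_t avgCeilU8(uint8_t A, uint8_t B) {
  return static_cast<uint8_t>((uint32_t(A) + uint32_t(B) + 1) >> 1);
}

int main() {
  assert(avgCeilU8(0, 0) == 0);
  assert(avgCeilU8(1, 2) == 2);       // rounds up on odd sums
  assert(avgCeilU8(255, 255) == 255); // no wraparound thanks to widening
}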
| 51617 | ||||
| 51618 | static SDValue combineLoad(SDNode *N, SelectionDAG &DAG, | |||
| 51619 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 51620 | const X86Subtarget &Subtarget) { | |||
| 51621 | LoadSDNode *Ld = cast<LoadSDNode>(N); | |||
| 51622 | EVT RegVT = Ld->getValueType(0); | |||
| 51623 | EVT MemVT = Ld->getMemoryVT(); | |||
| 51624 | SDLoc dl(Ld); | |||
| 51625 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 51626 | ||||
| 51627 | // For chips with slow 32-byte unaligned loads, break the 32-byte operation | |||
| 51628 | // into two 16-byte operations. Also split non-temporal aligned loads on | |||
| 51629 | // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads. | |||
| 51630 | ISD::LoadExtType Ext = Ld->getExtensionType(); | |||
| 51631 | unsigned Fast; | |||
| 51632 | if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() && | |||
| 51633 | Ext == ISD::NON_EXTLOAD && | |||
| 51634 | ((Ld->isNonTemporal() && !Subtarget.hasInt256() && | |||
| 51635 | Ld->getAlign() >= Align(16)) || | |||
| 51636 | (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT, | |||
| 51637 | *Ld->getMemOperand(), &Fast) && | |||
| 51638 | !Fast))) { | |||
| 51639 | unsigned NumElems = RegVT.getVectorNumElements(); | |||
| 51640 | if (NumElems < 2) | |||
| 51641 | return SDValue(); | |||
| 51642 | ||||
| 51643 | unsigned HalfOffset = 16; | |||
| 51644 | SDValue Ptr1 = Ld->getBasePtr(); | |||
| 51645 | SDValue Ptr2 = | |||
| 51646 | DAG.getMemBasePlusOffset(Ptr1, TypeSize::Fixed(HalfOffset), dl); | |||
| 51647 | EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), | |||
| 51648 | NumElems / 2); | |||
| 51649 | SDValue Load1 = | |||
| 51650 | DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(), | |||
| 51651 | Ld->getOriginalAlign(), | |||
| 51652 | Ld->getMemOperand()->getFlags()); | |||
| 51653 | SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2, | |||
| 51654 | Ld->getPointerInfo().getWithOffset(HalfOffset), | |||
| 51655 | Ld->getOriginalAlign(), | |||
| 51656 | Ld->getMemOperand()->getFlags()); | |||
| 51657 | SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, | |||
| 51658 | Load1.getValue(1), Load2.getValue(1)); | |||
| 51659 | ||||
| 51660 | SDValue NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Load1, Load2); | |||
| 51661 | return DCI.CombineTo(N, NewVec, TF, true); | |||
| 51662 | } | |||
| 51663 | ||||
| 51664 | // Bool vector load - attempt to cast to an integer, as we have good | |||
| 51665 | // (vXiY *ext(vXi1 bitcast(iX))) handling. | |||
| 51666 | if (Ext == ISD::NON_EXTLOAD && !Subtarget.hasAVX512() && RegVT.isVector() && | |||
| 51667 | RegVT.getScalarType() == MVT::i1 && DCI.isBeforeLegalize()) { | |||
| 51668 | unsigned NumElts = RegVT.getVectorNumElements(); | |||
| 51669 | EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts); | |||
| 51670 | if (TLI.isTypeLegal(IntVT)) { | |||
| 51671 | SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(), | |||
| 51672 | Ld->getPointerInfo(), | |||
| 51673 | Ld->getOriginalAlign(), | |||
| 51674 | Ld->getMemOperand()->getFlags()); | |||
| 51675 | SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad); | |||
| 51676 | return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true); | |||
| 51677 | } | |||
| 51678 | } | |||
| 51679 | ||||
| 51680 | // If we also broadcast this as a subvector to a wider type, then just extract | |||
| 51681 | // the lowest subvector. | |||
| 51682 | if (Ext == ISD::NON_EXTLOAD && Subtarget.hasAVX() && Ld->isSimple() && | |||
| 51683 | (RegVT.is128BitVector() || RegVT.is256BitVector())) { | |||
| 51684 | SDValue Ptr = Ld->getBasePtr(); | |||
| 51685 | SDValue Chain = Ld->getChain(); | |||
| 51686 | for (SDNode *User : Ptr->uses()) { | |||
| 51687 | if (User != N && User->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && | |||
| 51688 | cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr && | |||
| 51689 | cast<MemIntrinsicSDNode>(User)->getChain() == Chain && | |||
| 51690 | cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() == | |||
| 51691 | MemVT.getSizeInBits() && | |||
| 51692 | !User->hasAnyUseOfValue(1) && | |||
| 51693 | User->getValueSizeInBits(0).getFixedValue() > | |||
| 51694 | RegVT.getFixedSizeInBits()) { | |||
| 51695 | SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N), | |||
| 51696 | RegVT.getSizeInBits()); | |||
| 51697 | Extract = DAG.getBitcast(RegVT, Extract); | |||
| 51698 | return DCI.CombineTo(N, Extract, SDValue(User, 1)); | |||
| 51699 | } | |||
| 51700 | } | |||
| 51701 | } | |||
| 51702 | ||||
| 51703 | // Cast ptr32 and ptr64 pointers to the default address space before a load. | |||
| 51704 | unsigned AddrSpace = Ld->getAddressSpace(); | |||
| 51705 | if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR || | |||
| 51706 | AddrSpace == X86AS::PTR32_UPTR) { | |||
| 51707 | MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); | |||
| 51708 | if (PtrVT != Ld->getBasePtr().getSimpleValueType()) { | |||
| 51709 | SDValue Cast = | |||
| 51710 | DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0); | |||
| 51711 | return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(), | |||
| 51712 | Ld->getOriginalAlign(), | |||
| 51713 | Ld->getMemOperand()->getFlags()); | |||
| 51714 | } | |||
| 51715 | } | |||
| 51716 | ||||
| 51717 | return SDValue(); | |||
| 51718 | } | |||
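// A plain-memory sketch of the 32-byte load split at the top of combineLoad:
// two 16-byte loads at offsets 0 and 16 followed by a conceptual concat. The
// DAG version additionally token-factors the two load chains so ordering
// against other memory operations is preserved. Names are illustrative.
#include <cassert>
#include <cstdint>
#include <cstring>

struct Half { uint8_t B[16]; };

int main() {
  uint8_t Mem[32], Out[32];
  for (int i = 0; i != 32; ++i) Mem[i] = static_cast<uint8_t>(i);

  Half Lo, Hi;
  std::memcpy(&Lo, Mem, 16);      // Load1 at Ptr
  std::memcpy(&Hi, Mem + 16, 16); // Load2 at Ptr + HalfOffset (16)
  std::memcpy(Out, &Lo, 16);      // CONCAT_VECTORS(Load1, Load2)
  std::memcpy(Out + 16, &Hi, 16);

  for (int i = 0; i != 32; ++i) assert(Out[i] == i);
}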
| 51719 | ||||
| 51720 | /// If V is a build vector of boolean constants and exactly one of those | |||
| 51721 | /// constants is true, return the operand index of that true element. | |||
| 51722 | /// Otherwise, return -1. | |||
| 51723 | static int getOneTrueElt(SDValue V) { | |||
| 51724 | // This needs to be a build vector of booleans. | |||
| 51725 | // TODO: Checking for the i1 type matches the IR definition for the mask, | |||
| 51726 | // but the mask check could be loosened to i8 or other types. That might | |||
| 51727 | // also require checking more than 'allOnesValue'; e.g., the x86 HW | |||
| 51728 | // instructions only require that the MSB is set for each mask element. | |||
| 51729 | // The ISD::MSTORE comments/definition do not specify how the mask operand | |||
| 51730 | // is formatted. | |||
| 51731 | auto *BV = dyn_cast<BuildVectorSDNode>(V); | |||
| 51732 | if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1) | |||
| 51733 | return -1; | |||
| 51734 | ||||
| 51735 | int TrueIndex = -1; | |||
| 51736 | unsigned NumElts = BV->getValueType(0).getVectorNumElements(); | |||
| 51737 | for (unsigned i = 0; i < NumElts; ++i) { | |||
| 51738 | const SDValue &Op = BV->getOperand(i); | |||
| 51739 | if (Op.isUndef()) | |||
| 51740 | continue; | |||
| 51741 | auto *ConstNode = dyn_cast<ConstantSDNode>(Op); | |||
| 51742 | if (!ConstNode) | |||
| 51743 | return -1; | |||
| 51744 | if (ConstNode->getAPIntValue().countr_one() >= 1) { | |||
| 51745 | // If we already found a one, this is too many. | |||
| 51746 | if (TrueIndex >= 0) | |||
| 51747 | return -1; | |||
| 51748 | TrueIndex = i; | |||
| 51749 | } | |||
| 51750 | } | |||
| 51751 | return TrueIndex; | |||
| 51752 | } | |||
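// A freestanding rendering of the scan above, assuming a plain bool mask
// (undef lanes, which the original skips, are simply absent here): return
// the index of the single true element, or -1 for zero or multiple trues.
#include <cassert>
#include <vector>

static int getOneTrueEltSketch(const std::vector<bool> &Mask) {
  int TrueIndex = -1;
  for (unsigned i = 0; i != Mask.size(); ++i) {
    if (!Mask[i])
      continue;
    if (TrueIndex >= 0)
      return -1; // a second true element: too many
    TrueIndex = static_cast<int>(i);
  }
  return TrueIndex; // stays -1 for an all-false mask
}

int main() {
  assert(getOneTrueEltSketch({false, true, false, false}) == 1);
  assert(getOneTrueEltSketch({true, true, false, false}) == -1);
  assert(getOneTrueEltSketch({false, false, false, false}) == -1);
}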
| 51753 | ||||
| 51754 | /// Given a masked memory load/store operation, return true if it has one mask | |||
| 51755 | /// bit set. If it has one mask bit set, then also return the memory address of | |||
| 51756 | /// the scalar element to load/store, the vector index to insert/extract that | |||
| 51757 | /// scalar element, and the alignment for the scalar memory access. | |||
| 51758 | static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp, | |||
| 51759 | SelectionDAG &DAG, SDValue &Addr, | |||
| 51760 | SDValue &Index, Align &Alignment, | |||
| 51761 | unsigned &Offset) { | |||
| 51762 | int TrueMaskElt = getOneTrueElt(MaskedOp->getMask()); | |||
| 51763 | if (TrueMaskElt < 0) | |||
| 51764 | return false; | |||
| 51765 | ||||
| 51766 | // Get the address of the one scalar element that is specified by the mask | |||
| 51767 | // using the appropriate offset from the base pointer. | |||
| 51768 | EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType(); | |||
| 51769 | Offset = 0; | |||
| 51770 | Addr = MaskedOp->getBasePtr(); | |||
| 51771 | if (TrueMaskElt != 0) { | |||
| 51772 | Offset = TrueMaskElt * EltVT.getStoreSize(); | |||
| 51773 | Addr = DAG.getMemBasePlusOffset(Addr, TypeSize::Fixed(Offset), | |||
| 51774 | SDLoc(MaskedOp)); | |||
| 51775 | } | |||
| 51776 | ||||
| 51777 | Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp)); | |||
| 51778 | Alignment = commonAlignment(MaskedOp->getOriginalAlign(), | |||
| 51779 | EltVT.getStoreSize()); | |||
| 51780 | return true; | |||
| 51781 | } | |||
| 51782 | ||||
| 51783 | /// If exactly one element of the mask is set for a non-extending masked load, | |||
| 51784 | /// it is a scalar load and vector insert. | |||
| 51785 | /// Note: It is expected that the degenerate cases of an all-zeros or all-ones | |||
| 51786 | /// mask have already been optimized in IR, so we don't bother with those here. | |||
| 51787 | static SDValue | |||
| 51788 | reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG, | |||
| 51789 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 51790 | const X86Subtarget &Subtarget) { | |||
| 51791 | assert(ML->isUnindexed() && "Unexpected indexed masked load!"); | |||
| 51792 | // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. | |||
| 51793 | // However, some target hooks may need to be added to know when the transform | |||
| 51794 | // is profitable. Endianness would also have to be considered. | |||
| 51795 | ||||
| 51796 | SDValue Addr, VecIndex; | |||
| 51797 | Align Alignment; | |||
| 51798 | unsigned Offset; | |||
| 51799 | if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment, Offset)) | |||
| 51800 | return SDValue(); | |||
| 51801 | ||||
| 51802 | // Load the one scalar element that is specified by the mask using the | |||
| 51803 | // appropriate offset from the base pointer. | |||
| 51804 | SDLoc DL(ML); | |||
| 51805 | EVT VT = ML->getValueType(0); | |||
| 51806 | EVT EltVT = VT.getVectorElementType(); | |||
| 51807 | ||||
| 51808 | EVT CastVT = VT; | |||
| 51809 | if (EltVT == MVT::i64 && !Subtarget.is64Bit()) { | |||
| 51810 | EltVT = MVT::f64; | |||
| 51811 | CastVT = VT.changeVectorElementType(EltVT); | |||
| 51812 | } | |||
| 51813 | ||||
| 51814 | SDValue Load = | |||
| 51815 | DAG.getLoad(EltVT, DL, ML->getChain(), Addr, | |||
| 51816 | ML->getPointerInfo().getWithOffset(Offset), | |||
| 51817 | Alignment, ML->getMemOperand()->getFlags()); | |||
| 51818 | ||||
| 51819 | SDValue PassThru = DAG.getBitcast(CastVT, ML->getPassThru()); | |||
| 51820 | ||||
| 51821 | // Insert the loaded element into the appropriate place in the vector. | |||
| 51822 | SDValue Insert = | |||
| 51823 | DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, CastVT, PassThru, Load, VecIndex); | |||
| 51824 | Insert = DAG.getBitcast(VT, Insert); | |||
| 51825 | return DCI.CombineTo(ML, Insert, Load.getValue(1), true); | |||
| 51826 | } | |||
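// A scalar-equivalent sketch of the one-lane masked-load rewrite above
// (names illustrative): load only the selected element at its offset from
// the base pointer and insert it into the pass-through vector; every other
// lane keeps its pass-through value.
#include <array>
#include <cassert>
#include <cstdint>

static std::array<int32_t, 4>
maskedLoadOneLane(const int32_t *Base, unsigned TrueIdx,
                  std::array<int32_t, 4> PassThru) {
  PassThru[TrueIdx] = Base[TrueIdx]; // scalar load + INSERT_VECTOR_ELT
  return PassThru;
}

int main() {
  int32_t Mem[4] = {10, 20, 30, 40};
  auto R = maskedLoadOneLane(Mem, 2, {1, 2, 3, 4});
  assert(R[0] == 1 && R[1] == 2 && R[2] == 30 && R[3] == 4);
}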
| 51827 | ||||
| 51828 | static SDValue | |||
| 51829 | combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG, | |||
| 51830 | TargetLowering::DAGCombinerInfo &DCI) { | |||
| 51831 | assert(ML->isUnindexed() && "Unexpected indexed masked load!"); | |||
| 51832 | if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode())) | |||
| 51833 | return SDValue(); | |||
| 51834 | ||||
| 51835 | SDLoc DL(ML); | |||
| 51836 | EVT VT = ML->getValueType(0); | |||
| 51837 | ||||
| 51838 | // If we are loading the first and last elements of a vector, it is safe and | |||
| 51839 | // always faster to load the whole vector. Replace the masked load with a | |||
| 51840 | // vector load and select. | |||
| 51841 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 51842 | BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask()); | |||
| 51843 | bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0)); | |||
| 51844 | bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1)); | |||
| 51845 | if (LoadFirstElt && LoadLastElt) { | |||
| 51846 | SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(), | |||
| 51847 | ML->getMemOperand()); | |||
| 51848 | SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, | |||
| 51849 | ML->getPassThru()); | |||
| 51850 | return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true); | |||
| 51851 | } | |||
| 51852 | ||||
| 51853 | // Convert a masked load with a constant mask into a masked load and a select. | |||
| 51854 | // This allows the select operation to use a faster kind of select instruction | |||
| 51855 | // (for example, vblendvps -> vblendps). | |||
| 51856 | ||||
| 51857 | // Don't try this if the pass-through operand is already undefined. That would | |||
| 51858 | // cause an infinite loop because that's what we're about to create. | |||
| 51859 | if (ML->getPassThru().isUndef()) | |||
| 51860 | return SDValue(); | |||
| 51861 | ||||
| 51862 | if (ISD::isBuildVectorAllZeros(ML->getPassThru().getNode())) | |||
| 51863 | return SDValue(); | |||
| 51864 | ||||
| 51865 | // The new masked load has an undef pass-through operand. The select uses the | |||
| 51866 | // original pass-through operand. | |||
| 51867 | SDValue NewML = DAG.getMaskedLoad( | |||
| 51868 | VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(), | |||
| 51869 | DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(), | |||
| 51870 | ML->getAddressingMode(), ML->getExtensionType()); | |||
| 51871 | SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, | |||
| 51872 | ML->getPassThru()); | |||
| 51873 | ||||
| 51874 | return DCI.CombineTo(ML, Blend, NewML.getValue(1), true); | |||
| 51875 | } | |||
| 51876 | ||||
| 51877 | static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, | |||
| 51878 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 51879 | const X86Subtarget &Subtarget) { | |||
| 51880 | auto *Mld = cast<MaskedLoadSDNode>(N); | |||
| 51881 | ||||
| 51882 | // TODO: Expanding load with constant mask may be optimized as well. | |||
| 51883 | if (Mld->isExpandingLoad()) | |||
| 51884 | return SDValue(); | |||
| 51885 | ||||
| 51886 | if (Mld->getExtensionType() == ISD::NON_EXTLOAD) { | |||
| 51887 | if (SDValue ScalarLoad = | |||
| 51888 | reduceMaskedLoadToScalarLoad(Mld, DAG, DCI, Subtarget)) | |||
| 51889 | return ScalarLoad; | |||
| 51890 | ||||
| 51891 | // TODO: Do some AVX512 subsets benefit from this transform? | |||
| 51892 | if (!Subtarget.hasAVX512()) | |||
| 51893 | if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI)) | |||
| 51894 | return Blend; | |||
| 51895 | } | |||
| 51896 | ||||
| 51897 | // If the mask value has been legalized to a non-boolean vector, try to | |||
| 51898 | // simplify ops leading up to it. We only demand the MSB of each lane. | |||
| 51899 | SDValue Mask = Mld->getMask(); | |||
| 51900 | if (Mask.getScalarValueSizeInBits() != 1) { | |||
| 51901 | EVT VT = Mld->getValueType(0); | |||
| 51902 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 51903 | APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits())); | |||
| 51904 | if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) { | |||
| 51905 | if (N->getOpcode() != ISD::DELETED_NODE) | |||
| 51906 | DCI.AddToWorklist(N); | |||
| 51907 | return SDValue(N, 0); | |||
| 51908 | } | |||
| 51909 | if (SDValue NewMask = | |||
| 51910 | TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG)) | |||
| 51911 | return DAG.getMaskedLoad( | |||
| 51912 | VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(), | |||
| 51913 | NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(), | |||
| 51914 | Mld->getAddressingMode(), Mld->getExtensionType()); | |||
| 51915 | } | |||
| 51916 | ||||
| 51917 | return SDValue(); | |||
| 51918 | } | |||
| 51919 | ||||
| 51920 | /// If exactly one element of the mask is set for a non-truncating masked store, | |||
| 51921 | /// it is a vector extract and scalar store. | |||
| 51922 | /// Note: It is expected that the degenerate cases of an all-zeros or all-ones | |||
| 51923 | /// mask have already been optimized in IR, so we don't bother with those here. | |||
| 51924 | static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS, | |||
| 51925 | SelectionDAG &DAG, | |||
| 51926 | const X86Subtarget &Subtarget) { | |||
| 51927 | // TODO: This is not x86-specific, so it could be lifted to DAGCombiner. | |||
| 51928 | // However, some target hooks may need to be added to know when the transform | |||
| 51929 | // is profitable. Endianness would also have to be considered. | |||
| 51930 | ||||
| 51931 | SDValue Addr, VecIndex; | |||
| 51932 | Align Alignment; | |||
| 51933 | unsigned Offset; | |||
| 51934 | if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment, Offset)) | |||
| 51935 | return SDValue(); | |||
| 51936 | ||||
| 51937 | // Extract the one scalar element that is actually being stored. | |||
| 51938 | SDLoc DL(MS); | |||
| 51939 | SDValue Value = MS->getValue(); | |||
| 51940 | EVT VT = Value.getValueType(); | |||
| 51941 | EVT EltVT = VT.getVectorElementType(); | |||
| 51942 | if (EltVT == MVT::i64 && !Subtarget.is64Bit()) { | |||
| 51943 | EltVT = MVT::f64; | |||
| 51944 | EVT CastVT = VT.changeVectorElementType(EltVT); | |||
| 51945 | Value = DAG.getBitcast(CastVT, Value); | |||
| 51946 | } | |||
| 51947 | SDValue Extract = | |||
| 51948 | DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Value, VecIndex); | |||
| 51949 | ||||
| 51950 | // Store that element at the appropriate offset from the base pointer. | |||
| 51951 | return DAG.getStore(MS->getChain(), DL, Extract, Addr, | |||
| 51952 | MS->getPointerInfo().getWithOffset(Offset), | |||
| 51953 | Alignment, MS->getMemOperand()->getFlags()); | |||
| 51954 | } | |||
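// The mirror image of the masked-load rewrite, again as a scalar sketch
// (names illustrative): extract the single live lane and perform one scalar
// store at that element's offset from the base pointer.
#include <cassert>
#include <cstdint>

static void maskedStoreOneLane(int32_t *Base, unsigned TrueIdx,
                               const int32_t (&Val)[4]) {
  Base[TrueIdx] = Val[TrueIdx]; // EXTRACT_VECTOR_ELT + scalar store
}

int main() {
  int32_t Mem[4] = {0, 0, 0, 0};
  const int32_t V[4] = {10, 20, 30, 40};
  maskedStoreOneLane(Mem, 1, V);
  assert(Mem[0] == 0 && Mem[1] == 20 && Mem[2] == 0 && Mem[3] == 0);
}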
| 51955 | ||||
| 51956 | static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, | |||
| 51957 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 51958 | const X86Subtarget &Subtarget) { | |||
| 51959 | MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N); | |||
| 51960 | if (Mst->isCompressingStore()) | |||
| 51961 | return SDValue(); | |||
| 51962 | ||||
| 51963 | EVT VT = Mst->getValue().getValueType(); | |||
| 51964 | SDLoc dl(Mst); | |||
| 51965 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 51966 | ||||
| 51967 | if (Mst->isTruncatingStore()) | |||
| 51968 | return SDValue(); | |||
| 51969 | ||||
| 51970 | if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG, Subtarget)) | |||
| 51971 | return ScalarStore; | |||
| 51972 | ||||
| 51973 | // If the mask value has been legalized to a non-boolean vector, try to | |||
| 51974 | // simplify ops leading up to it. We only demand the MSB of each lane. | |||
| 51975 | SDValue Mask = Mst->getMask(); | |||
| 51976 | if (Mask.getScalarValueSizeInBits() != 1) { | |||
| 51977 | APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits())); | |||
| 51978 | if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) { | |||
| 51979 | if (N->getOpcode() != ISD::DELETED_NODE) | |||
| 51980 | DCI.AddToWorklist(N); | |||
| 51981 | return SDValue(N, 0); | |||
| 51982 | } | |||
| 51983 | if (SDValue NewMask = | |||
| 51984 | TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG)) | |||
| 51985 | return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(), | |||
| 51986 | Mst->getBasePtr(), Mst->getOffset(), NewMask, | |||
| 51987 | Mst->getMemoryVT(), Mst->getMemOperand(), | |||
| 51988 | Mst->getAddressingMode()); | |||
| 51989 | } | |||
| 51990 | ||||
| 51991 | SDValue Value = Mst->getValue(); | |||
| 51992 | if (Value.getOpcode() == ISD::TRUNCATE && Value.getNode()->hasOneUse() && | |||
| 51993 | TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(), | |||
| 51994 | Mst->getMemoryVT())) { | |||
| 51995 | return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0), | |||
| 51996 | Mst->getBasePtr(), Mst->getOffset(), Mask, | |||
| 51997 | Mst->getMemoryVT(), Mst->getMemOperand(), | |||
| 51998 | Mst->getAddressingMode(), true); | |||
| 51999 | } | |||
| 52000 | ||||
| 52001 | return SDValue(); | |||
| 52002 | } | |||
| 52003 | ||||
| 52004 | static SDValue combineStore(SDNode *N, SelectionDAG &DAG, | |||
| 52005 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 52006 | const X86Subtarget &Subtarget) { | |||
| 52007 | StoreSDNode *St = cast<StoreSDNode>(N); | |||
| 52008 | EVT StVT = St->getMemoryVT(); | |||
| 52009 | SDLoc dl(St); | |||
| 52010 | SDValue StoredVal = St->getValue(); | |||
| 52011 | EVT VT = StoredVal.getValueType(); | |||
| 52012 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 52013 | ||||
| 52014 | // Convert a store of vXi1 into a store of iX and a bitcast. | |||
| 52015 | if (!Subtarget.hasAVX512() && VT == StVT && VT.isVector() && | |||
| 52016 | VT.getVectorElementType() == MVT::i1) { | |||
| 52017 | ||||
| 52018 | EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements()); | |||
| 52019 | StoredVal = DAG.getBitcast(NewVT, StoredVal); | |||
| 52020 | ||||
| 52021 | return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), | |||
| 52022 | St->getPointerInfo(), St->getOriginalAlign(), | |||
| 52023 | St->getMemOperand()->getFlags()); | |||
| 52024 | } | |||
| 52025 | ||||
| 52026 | // If this is a store of a scalar_to_vector to v1i1, just use a scalar store. | |||
| 52027 | // This will avoid a copy to k-register. | |||
| 52028 | if (VT == MVT::v1i1 && VT == StVT && Subtarget.hasAVX512() && | |||
| 52029 | StoredVal.getOpcode() == ISD::SCALAR_TO_VECTOR && | |||
| 52030 | StoredVal.getOperand(0).getValueType() == MVT::i8) { | |||
| 52031 | SDValue Val = StoredVal.getOperand(0); | |||
| 52032 | // We must store zeros to the unused bits. | |||
| 52033 | Val = DAG.getZeroExtendInReg(Val, dl, MVT::i1); | |||
| 52034 | return DAG.getStore(St->getChain(), dl, Val, | |||
| 52035 | St->getBasePtr(), St->getPointerInfo(), | |||
| 52036 | St->getOriginalAlign(), | |||
| 52037 | St->getMemOperand()->getFlags()); | |||
| 52038 | } | |||
| 52039 | ||||
| 52040 | // Widen v1i1/v2i1/v4i1 stores to v8i1. | |||
| 52041 | if ((VT == MVT::v1i1 || VT == MVT::v2i1 || VT == MVT::v4i1) && VT == StVT && | |||
| 52042 | Subtarget.hasAVX512()) { | |||
| 52043 | unsigned NumConcats = 8 / VT.getVectorNumElements(); | |||
| 52044 | // We must store zeros to the unused bits. | |||
| 52045 | SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, VT)); | |||
| 52046 | Ops[0] = StoredVal; | |||
| 52047 | StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops); | |||
| 52048 | return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), | |||
| 52049 | St->getPointerInfo(), St->getOriginalAlign(), | |||
| 52050 | St->getMemOperand()->getFlags()); | |||
| 52051 | } | |||
| 52052 | ||||
| 52053 | // Turn vXi1 stores of constants into scalar stores. | |||
| 52054 | if ((VT == MVT::v8i1 || VT == MVT::v16i1 || VT == MVT::v32i1 || | |||
| 52055 | VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) && | |||
| 52056 | ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) { | |||
| 52057 | // If it's a v64i1 store without 64-bit support, we need two stores. | |||
| 52058 | if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) { | |||
| 52059 | SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl, | |||
| 52060 | StoredVal->ops().slice(0, 32)); | |||
| 52061 | Lo = combinevXi1ConstantToInteger(Lo, DAG); | |||
| 52062 | SDValue Hi = DAG.getBuildVector(MVT::v32i1, dl, | |||
| 52063 | StoredVal->ops().slice(32, 32)); | |||
| 52064 | Hi = combinevXi1ConstantToInteger(Hi, DAG); | |||
| 52065 | ||||
| 52066 | SDValue Ptr0 = St->getBasePtr(); | |||
| 52067 | SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, TypeSize::Fixed(4), dl); | |||
| 52068 | ||||
| 52069 | SDValue Ch0 = | |||
| 52070 | DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(), | |||
| 52071 | St->getOriginalAlign(), | |||
| 52072 | St->getMemOperand()->getFlags()); | |||
| 52073 | SDValue Ch1 = | |||
| 52074 | DAG.getStore(St->getChain(), dl, Hi, Ptr1, | |||
| 52075 | St->getPointerInfo().getWithOffset(4), | |||
| 52076 | St->getOriginalAlign(), | |||
| 52077 | St->getMemOperand()->getFlags()); | |||
| 52078 | return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1); | |||
| 52079 | } | |||
| 52080 | ||||
| 52081 | StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG); | |||
| 52082 | return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(), | |||
| 52083 | St->getPointerInfo(), St->getOriginalAlign(), | |||
| 52084 | St->getMemOperand()->getFlags()); | |||
| 52085 | } | |||
| 52086 | ||||
| 52087 | // If we are saving a 32-byte vector and 32-byte stores are slow, such as on | |||
| 52088 | // Sandy Bridge, perform two 16-byte stores. | |||
| 52089 | unsigned Fast; | |||
| 52090 | if (VT.is256BitVector() && StVT == VT && | |||
| 52091 | TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, | |||
| 52092 | *St->getMemOperand(), &Fast) && | |||
| 52093 | !Fast) { | |||
| 52094 | unsigned NumElems = VT.getVectorNumElements(); | |||
| 52095 | if (NumElems < 2) | |||
| 52096 | return SDValue(); | |||
| 52097 | ||||
| 52098 | return splitVectorStore(St, DAG); | |||
| 52099 | } | |||
| 52100 | ||||
| 52101 | // Split under-aligned vector non-temporal stores. | |||
| 52102 | if (St->isNonTemporal() && StVT == VT && | |||
| 52103 | St->getAlign().value() < VT.getStoreSize()) { | |||
| 52104 | // ZMM/YMM nt-stores - either it can be stored as a series of shorter | |||
| 52105 | // vectors or the legalizer can scalarize it to use MOVNTI. | |||
| 52106 | if (VT.is256BitVector() || VT.is512BitVector()) { | |||
| 52107 | unsigned NumElems = VT.getVectorNumElements(); | |||
| 52108 | if (NumElems < 2) | |||
| 52109 | return SDValue(); | |||
| 52110 | return splitVectorStore(St, DAG); | |||
| 52111 | } | |||
| 52112 | ||||
| 52113 | // XMM nt-stores - scalarize this to f64 nt-stores on SSE4A, else i32/i64 | |||
| 52114 | // to use MOVNTI. | |||
| 52115 | if (VT.is128BitVector() && Subtarget.hasSSE2()) { | |||
| 52116 | MVT NTVT = Subtarget.hasSSE4A() | |||
| 52117 | ? MVT::v2f64 | |||
| 52118 | : (TLI.isTypeLegal(MVT::i64) ? MVT::v2i64 : MVT::v4i32); | |||
| 52119 | return scalarizeVectorStore(St, NTVT, DAG); | |||
| 52120 | } | |||
| 52121 | } | |||
| 52122 | ||||
| 52123 | // Try to optimize v16i16->v16i8 truncating stores when BWI is not | |||
| 52124 | // supported, but AVX512F is, by extending to v16i32 and truncating. | |||
| 52125 | if (!St->isTruncatingStore() && VT == MVT::v16i8 && !Subtarget.hasBWI() && | |||
| 52126 | St->getValue().getOpcode() == ISD::TRUNCATE && | |||
| 52127 | St->getValue().getOperand(0).getValueType() == MVT::v16i16 && | |||
| 52128 | TLI.isTruncStoreLegal(MVT::v16i32, MVT::v16i8) && | |||
| 52129 | St->getValue().hasOneUse() && !DCI.isBeforeLegalizeOps()) { | |||
| 52130 | SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::v16i32, | |||
| 52131 | St->getValue().getOperand(0)); | |||
| 52132 | return DAG.getTruncStore(St->getChain(), dl, Ext, St->getBasePtr(), | |||
| 52133 | MVT::v16i8, St->getMemOperand()); | |||
| 52134 | } | |||
| 52135 | ||||
| 52136 | // Try to fold a VTRUNCUS or VTRUNCS into a truncating store. | |||
| 52137 | if (!St->isTruncatingStore() && | |||
| 52138 | (StoredVal.getOpcode() == X86ISD::VTRUNCUS || | |||
| 52139 | StoredVal.getOpcode() == X86ISD::VTRUNCS) && | |||
| 52140 | StoredVal.hasOneUse() && | |||
| 52141 | TLI.isTruncStoreLegal(StoredVal.getOperand(0).getValueType(), VT)) { | |||
| 52142 | bool IsSigned = StoredVal.getOpcode() == X86ISD::VTRUNCS; | |||
| 52143 | return EmitTruncSStore(IsSigned, St->getChain(), | |||
| 52144 | dl, StoredVal.getOperand(0), St->getBasePtr(), | |||
| 52145 | VT, St->getMemOperand(), DAG); | |||
| 52146 | } | |||
| 52147 | ||||
| 52148 | // Try to fold an extract_element(VTRUNC) pattern into a truncating store. | |||
| 52149 | if (!St->isTruncatingStore()) { | |||
| 52150 | auto IsExtractedElement = [](SDValue V) { | |||
| 52151 | if (V.getOpcode() == ISD::TRUNCATE && V.hasOneUse()) | |||
| 52152 | V = V.getOperand(0); | |||
| 52153 | unsigned Opc = V.getOpcode(); | |||
| 52154 | if ((Opc == ISD::EXTRACT_VECTOR_ELT || Opc == X86ISD::PEXTRW) && | |||
| 52155 | isNullConstant(V.getOperand(1)) && V.hasOneUse() && | |||
| 52156 | V.getOperand(0).hasOneUse()) | |||
| 52157 | return V.getOperand(0); | |||
| 52158 | return SDValue(); | |||
| 52159 | }; | |||
| 52160 | if (SDValue Extract = IsExtractedElement(StoredVal)) { | |||
| 52161 | SDValue Trunc = peekThroughOneUseBitcasts(Extract); | |||
| 52162 | if (Trunc.getOpcode() == X86ISD::VTRUNC) { | |||
| 52163 | SDValue Src = Trunc.getOperand(0); | |||
| 52164 | MVT DstVT = Trunc.getSimpleValueType(); | |||
| 52165 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 52166 | unsigned NumSrcElts = SrcVT.getVectorNumElements(); | |||
| 52167 | unsigned NumTruncBits = DstVT.getScalarSizeInBits() * NumSrcElts; | |||
| 52168 | MVT TruncVT = MVT::getVectorVT(DstVT.getScalarType(), NumSrcElts); | |||
| 52169 | if (NumTruncBits == VT.getSizeInBits() && | |||
| 52170 | TLI.isTruncStoreLegal(SrcVT, TruncVT)) { | |||
| 52171 | return DAG.getTruncStore(St->getChain(), dl, Src, St->getBasePtr(), | |||
| 52172 | TruncVT, St->getMemOperand()); | |||
| 52173 | } | |||
| 52174 | } | |||
| 52175 | } | |||
| 52176 | } | |||
| 52177 | ||||
| 52178 | // Optimize trunc store (of multiple scalars) to shuffle and store. | |||
| 52179 | // First, pack all of the elements in one place. Next, store to memory | |||
| 52180 | // in fewer chunks. | |||
| 52181 | if (St->isTruncatingStore() && VT.isVector()) { | |||
| 52182 | // Check if we can detect an AVG pattern from the truncation. If yes, | |||
| 52183 | // replace the trunc store with a normal store of the result of an | |||
| 52184 | // ISD::AVGCEILU instruction. | |||
| 52185 | if (DCI.isBeforeLegalize() || TLI.isTypeLegal(St->getMemoryVT())) | |||
| 52186 | if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG, | |||
| 52187 | Subtarget, dl)) | |||
| 52188 | return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(), | |||
| 52189 | St->getPointerInfo(), St->getOriginalAlign(), | |||
| 52190 | St->getMemOperand()->getFlags()); | |||
| 52191 | ||||
| 52192 | if (TLI.isTruncStoreLegal(VT, StVT)) { | |||
| 52193 | if (SDValue Val = detectSSatPattern(St->getValue(), St->getMemoryVT())) | |||
| 52194 | return EmitTruncSStore(true /* Signed saturation */, St->getChain(), | |||
| 52195 | dl, Val, St->getBasePtr(), | |||
| 52196 | St->getMemoryVT(), St->getMemOperand(), DAG); | |||
| 52197 | if (SDValue Val = detectUSatPattern(St->getValue(), St->getMemoryVT(), | |||
| 52198 | DAG, dl)) | |||
| 52199 | return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(), | |||
| 52200 | dl, Val, St->getBasePtr(), | |||
| 52201 | St->getMemoryVT(), St->getMemOperand(), DAG); | |||
| 52202 | } | |||
| 52203 | ||||
| 52204 | return SDValue(); | |||
| 52205 | } | |||
| 52206 | ||||
| 52207 | // Cast ptr32 and ptr64 pointers to the default address space before a store. | |||
| 52208 | unsigned AddrSpace = St->getAddressSpace(); | |||
| 52209 | if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR || | |||
| 52210 | AddrSpace == X86AS::PTR32_UPTR) { | |||
| 52211 | MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); | |||
| 52212 | if (PtrVT != St->getBasePtr().getSimpleValueType()) { | |||
| 52213 | SDValue Cast = | |||
| 52214 | DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0); | |||
| 52215 | return DAG.getStore(St->getChain(), dl, StoredVal, Cast, | |||
| 52216 | St->getPointerInfo(), St->getOriginalAlign(), | |||
| 52217 | St->getMemOperand()->getFlags(), St->getAAInfo()); | |||
| 52218 | } | |||
| 52219 | } | |||
| 52220 | ||||
| 52221 | // Turn load->store of MMX types into GPR load/stores. This avoids clobbering | |||
| 52222 | // the FP state in cases where an emms may be missing. | |||
| 52223 | // A preferable solution to the general problem is to figure out the right | |||
| 52224 | // places to insert EMMS. This qualifies as a quick hack. | |||
| 52225 | ||||
| 52226 | // Similarly, turn load->store of i64 into double load/stores in 32-bit mode. | |||
| 52227 | if (VT.getSizeInBits() != 64) | |||
| 52228 | return SDValue(); | |||
| 52229 | ||||
| 52230 | const Function &F = DAG.getMachineFunction().getFunction(); | |||
| 52231 | bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); | |||
| 52232 | bool F64IsLegal = | |||
| 52233 | !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2(); | |||
| 52234 | if ((VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit()) && | |||
| 52235 | isa<LoadSDNode>(St->getValue()) && | |||
| 52236 | cast<LoadSDNode>(St->getValue())->isSimple() && | |||
| 52237 | St->getChain().hasOneUse() && St->isSimple()) { | |||
| 52238 | LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode()); | |||
| 52239 | ||||
| 52240 | if (!ISD::isNormalLoad(Ld)) | |||
| 52241 | return SDValue(); | |||
| 52242 | ||||
| 52243 | // Avoid the transformation if there are multiple uses of the loaded value. | |||
| 52244 | if (!Ld->hasNUsesOfValue(1, 0)) | |||
| 52245 | return SDValue(); | |||
| 52246 | ||||
| 52247 | SDLoc LdDL(Ld); | |||
| 52248 | SDLoc StDL(N); | |||
| 52249 | // Lower to a single movq load/store pair. | |||
| 52250 | SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(), | |||
| 52251 | Ld->getBasePtr(), Ld->getMemOperand()); | |||
| 52252 | ||||
| 52253 | // Make sure new load is placed in same chain order. | |||
| 52254 | DAG.makeEquivalentMemoryOrdering(Ld, NewLd); | |||
| 52255 | return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(), | |||
| 52256 | St->getMemOperand()); | |||
| 52257 | } | |||
| 52258 | ||||
| 52259 | // This is similar to the above case, but here we handle a scalar 64-bit | |||
| 52260 | // integer store that is extracted from a vector on a 32-bit target. | |||
| 52261 | // If we have SSE2, then we can treat it like a floating-point double | |||
| 52262 | // to get past legalization. The execution dependencies fixup pass will | |||
| 52263 | // choose the optimal machine instruction for the store if this really is | |||
| 52264 | // an integer or v2f32 rather than an f64. | |||
| 52265 | if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() && | |||
| 52266 | St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) { | |||
| 52267 | SDValue OldExtract = St->getOperand(1); | |||
| 52268 | SDValue ExtOp0 = OldExtract.getOperand(0); | |||
| 52269 | unsigned VecSize = ExtOp0.getValueSizeInBits(); | |||
| 52270 | EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64); | |||
| 52271 | SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0); | |||
| 52272 | SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, | |||
| 52273 | BitCast, OldExtract.getOperand(1)); | |||
| 52274 | return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(), | |||
| 52275 | St->getPointerInfo(), St->getOriginalAlign(), | |||
| 52276 | St->getMemOperand()->getFlags()); | |||
| 52277 | } | |||
| 52278 | ||||
| 52279 | return SDValue(); | |||
| 52280 | } | |||
| 52281 | ||||
| 52282 | static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG, | |||
| 52283 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 52284 | const X86Subtarget &Subtarget) { | |||
| 52285 | auto *St = cast<MemIntrinsicSDNode>(N); | |||
| 52286 | ||||
| 52287 | SDValue StoredVal = N->getOperand(1); | |||
| 52288 | MVT VT = StoredVal.getSimpleValueType(); | |||
| 52289 | EVT MemVT = St->getMemoryVT(); | |||
| 52290 | ||||
| 52291 | // Figure out which elements we demand. | |||
| 52292 | unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits(); | |||
| 52293 | APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts); | |||
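| | // e.g. a 128-bit extract-store from a v8f32 source demands only elements 0..3. | |||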
| 52294 | ||||
| 52295 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 52296 | if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, DCI)) { | |||
| 52297 | if (N->getOpcode() != ISD::DELETED_NODE) | |||
| 52298 | DCI.AddToWorklist(N); | |||
| 52299 | return SDValue(N, 0); | |||
| 52300 | } | |||
| 52301 | ||||
| 52302 | return SDValue(); | |||
| 52303 | } | |||
| 52304 | ||||
| 52305 | /// Return 'true' if this vector operation is "horizontal" | |||
| 52306 | /// and return the operands for the horizontal operation in LHS and RHS. A | |||
| 52307 | /// horizontal operation performs the binary operation on successive elements | |||
| 52308 | /// of its first operand, then on successive elements of its second operand, | |||
| 52309 | /// returning the resulting values in a vector. For example, if | |||
| 52310 | /// A = < float a0, float a1, float a2, float a3 > | |||
| 52311 | /// and | |||
| 52312 | /// B = < float b0, float b1, float b2, float b3 > | |||
| 52313 | /// then the result of doing a horizontal operation on A and B is | |||
| 52314 | /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >. | |||
| 52315 | /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form | |||
| 52316 | /// A horizontal-op B, for some already available A and B, and if so then LHS is | |||
| 52317 | /// set to A, RHS to B, and the routine returns 'true'. | |||
| 52318 | static bool isHorizontalBinOp(unsigned HOpcode, SDValue &LHS, SDValue &RHS, | |||
| 52319 | SelectionDAG &DAG, const X86Subtarget &Subtarget, | |||
| 52320 | bool IsCommutative, | |||
| 52321 | SmallVectorImpl<int> &PostShuffleMask) { | |||
| 52322 | // If either operand is undef, bail out. The binop should be simplified. | |||
| 52323 | if (LHS.isUndef() || RHS.isUndef()) | |||
| 52324 | return false; | |||
| 52325 | ||||
| 52326 | // Look for the following pattern: | |||
| 52327 | // A = < float a0, float a1, float a2, float a3 > | |||
| 52328 | // B = < float b0, float b1, float b2, float b3 > | |||
| 52329 | // and | |||
| 52330 | // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6> | |||
| 52331 | // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7> | |||
| 52332 | // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 > | |||
| 52333 | // which is A horizontal-op B. | |||
| 52334 | ||||
| 52335 | MVT VT = LHS.getSimpleValueType(); | |||
| 52336 | assert((VT.is128BitVector() || VT.is256BitVector()) && | |||
| 52337 | "Unsupported vector type for horizontal add/sub"); | |||
| 52338 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 52339 | ||||
| 52340 | auto GetShuffle = [&](SDValue Op, SDValue &N0, SDValue &N1, | |||
| 52341 | SmallVectorImpl<int> &ShuffleMask) { | |||
| 52342 | bool UseSubVector = false; | |||
| 52343 | if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR && | |||
| 52344 | Op.getOperand(0).getValueType().is256BitVector() && | |||
| 52345 | llvm::isNullConstant(Op.getOperand(1))) { | |||
| 52346 | Op = Op.getOperand(0); | |||
| 52347 | UseSubVector = true; | |||
| 52348 | } | |||
| 52349 | SmallVector<SDValue, 2> SrcOps; | |||
| 52350 | SmallVector<int, 16> SrcMask, ScaledMask; | |||
| 52351 | SDValue BC = peekThroughBitcasts(Op); | |||
| 52352 | if (getTargetShuffleInputs(BC, SrcOps, SrcMask, DAG) && | |||
| 52353 | !isAnyZero(SrcMask) && all_of(SrcOps, [BC](SDValue Op) { | |||
| 52354 | return Op.getValueSizeInBits() == BC.getValueSizeInBits(); | |||
| 52355 | })) { | |||
| 52356 | resolveTargetShuffleInputsAndMask(SrcOps, SrcMask); | |||
| 52357 | if (!UseSubVector && SrcOps.size() <= 2 && | |||
| 52358 | scaleShuffleElements(SrcMask, NumElts, ScaledMask)) { | |||
| 52359 | N0 = !SrcOps.empty() ? SrcOps[0] : SDValue(); | |||
| 52360 | N1 = SrcOps.size() > 1 ? SrcOps[1] : SDValue(); | |||
| 52361 | ShuffleMask.assign(ScaledMask.begin(), ScaledMask.end()); | |||
| 52362 | } | |||
| 52363 | if (UseSubVector && SrcOps.size() == 1 && | |||
| 52364 | scaleShuffleElements(SrcMask, 2 * NumElts, ScaledMask)) { | |||
| 52365 | std::tie(N0, N1) = DAG.SplitVector(SrcOps[0], SDLoc(Op)); | |||
| 52366 | ArrayRef<int> Mask = ArrayRef<int>(ScaledMask).slice(0, NumElts); | |||
| 52367 | ShuffleMask.assign(Mask.begin(), Mask.end()); | |||
| 52368 | } | |||
| 52369 | } | |||
| 52370 | }; | |||
| 52371 | ||||
| 52372 | // View LHS in the form | |||
| 52373 | // LHS = VECTOR_SHUFFLE A, B, LMask | |||
| 52374 | // If LHS is not a shuffle, then pretend it is the identity shuffle: | |||
| 52375 | // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1> | |||
| 52376 | // NOTE: A default initialized SDValue represents an UNDEF of type VT. | |||
| 52377 | SDValue A, B; | |||
| 52378 | SmallVector<int, 16> LMask; | |||
| 52379 | GetShuffle(LHS, A, B, LMask); | |||
| 52380 | ||||
| 52381 | // Likewise, view RHS in the form | |||
| 52382 | // RHS = VECTOR_SHUFFLE C, D, RMask | |||
| 52383 | SDValue C, D; | |||
| 52384 | SmallVector<int, 16> RMask; | |||
| 52385 | GetShuffle(RHS, C, D, RMask); | |||
| 52386 | ||||
| 52387 | // At least one of the operands should be a vector shuffle. | |||
| 52388 | unsigned NumShuffles = (LMask.empty() ? 0 : 1) + (RMask.empty() ? 0 : 1); | |||
| 52389 | if (NumShuffles == 0) | |||
| 52390 | return false; | |||
| 52391 | ||||
| 52392 | if (LMask.empty()) { | |||
| 52393 | A = LHS; | |||
| 52394 | for (unsigned i = 0; i != NumElts; ++i) | |||
| 52395 | LMask.push_back(i); | |||
| 52396 | } | |||
| 52397 | ||||
| 52398 | if (RMask.empty()) { | |||
| 52399 | C = RHS; | |||
| 52400 | for (unsigned i = 0; i != NumElts; ++i) | |||
| 52401 | RMask.push_back(i); | |||
| 52402 | } | |||
| 52403 | ||||
| 52404 | // If we have a unary mask, ensure the other op is set to null. | |||
| 52405 | if (isUndefOrInRange(LMask, 0, NumElts)) | |||
| 52406 | B = SDValue(); | |||
| 52407 | else if (isUndefOrInRange(LMask, NumElts, NumElts * 2)) | |||
| 52408 | A = SDValue(); | |||
| 52409 | ||||
| 52410 | if (isUndefOrInRange(RMask, 0, NumElts)) | |||
| 52411 | D = SDValue(); | |||
| 52412 | else if (isUndefOrInRange(RMask, NumElts, NumElts * 2)) | |||
| 52413 | C = SDValue(); | |||
| 52414 | ||||
| 52415 | // If A and B occur in reverse order in RHS, then canonicalize by commuting | |||
| 52416 | // RHS operands and shuffle mask. | |||
| 52417 | if (A != C) { | |||
| 52418 | std::swap(C, D); | |||
| 52419 | ShuffleVectorSDNode::commuteMask(RMask); | |||
| 52420 | } | |||
| 52421 | // Check that the shuffles are both shuffling the same vectors. | |||
| 52422 | if (!(A == C && B == D)) | |||
| 52423 | return false; | |||
| 52424 | ||||
| 52425 | PostShuffleMask.clear(); | |||
| 52426 | PostShuffleMask.append(NumElts, SM_SentinelUndef); | |||
| 52427 | ||||
| 52428 | // LHS and RHS are now: | |||
| 52429 | // LHS = shuffle A, B, LMask | |||
| 52430 | // RHS = shuffle A, B, RMask | |||
| 52431 | // Check that the masks correspond to performing a horizontal operation. | |||
| 52432 | // AVX defines horizontal add/sub to operate independently on 128-bit lanes, | |||
| 52433 | // so we just repeat the inner loop if this is a 256-bit op. | |||
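| | // e.g. for v8i32, the pairing below is checked for elements 0..3 and again, | |||
| | // independently, for elements 4..7. | |||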
| 52434 | unsigned Num128BitChunks = VT.getSizeInBits() / 128; | |||
| 52435 | unsigned NumEltsPer128BitChunk = NumElts / Num128BitChunks; | |||
| 52436 | unsigned NumEltsPer64BitChunk = NumEltsPer128BitChunk / 2; | |||
| 52437 | assert((NumEltsPer128BitChunk % 2 == 0) && | |||
| 52438 | "Vector type should have an even number of elements in each lane"); | |||
| 52439 | for (unsigned j = 0; j != NumElts; j += NumEltsPer128BitChunk) { | |||
| 52440 | for (unsigned i = 0; i != NumEltsPer128BitChunk; ++i) { | |||
| 52441 | // Ignore undefined components. | |||
| 52442 | int LIdx = LMask[i + j], RIdx = RMask[i + j]; | |||
| 52443 | if (LIdx < 0 || RIdx < 0 || | |||
| 52444 | (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) || | |||
| 52445 | (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts))) | |||
| 52446 | continue; | |||
| 52447 | ||||
| 52448 | // Check that successive odd/even elements are being operated on. If not, | |||
| 52449 | // this is not a horizontal operation. | |||
| 52450 | if (!((RIdx & 1) == 1 && (LIdx + 1) == RIdx) && | |||
| 52451 | !((LIdx & 1) == 1 && (RIdx + 1) == LIdx && IsCommutative)) | |||
| 52452 | return false; | |||
| 52453 | ||||
| 52454 | // Compute the post-shuffle mask index based on where the element | |||
| 52455 | // is stored in the HOP result, and where it needs to be moved to. | |||
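| | // e.g. for v8f32 (NumEltsPer128BitChunk == 4), LIdx == 2 gives Base == 2 and | |||
| | // Index == 1, since a2 op a3 lands in element 1 of HADD(A,B); B-sourced bases | |||
| | // are then bumped into the high half of the lane just below. | |||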
| 52456 | int Base = LIdx & ~1u; | |||
| 52457 | int Index = ((Base % NumEltsPer128BitChunk) / 2) + | |||
| 52458 | ((Base % NumElts) & ~(NumEltsPer128BitChunk - 1)); | |||
| 52459 | ||||
| 52460 | // The low half of the 128-bit result must choose from A. | |||
| 52461 | // The high half of the 128-bit result must choose from B, | |||
| 52462 | // unless B is undef. In that case, we are always choosing from A. | |||
| 52463 | if ((B && Base >= (int)NumElts) || (!B && i >= NumEltsPer64BitChunk)) | |||
| 52464 | Index += NumEltsPer64BitChunk; | |||
| 52465 | PostShuffleMask[i + j] = Index; | |||
| 52466 | } | |||
| 52467 | } | |||
| 52468 | ||||
| 52469 | SDValue NewLHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it. | |||
| 52470 | SDValue NewRHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it. | |||
| 52471 | ||||
| 52472 | bool IsIdentityPostShuffle = | |||
| 52473 | isSequentialOrUndefInRange(PostShuffleMask, 0, NumElts, 0); | |||
| 52474 | if (IsIdentityPostShuffle) | |||
| 52475 | PostShuffleMask.clear(); | |||
| 52476 | ||||
| 52477 | // Avoid 128-bit multi-lane shuffles if pre-AVX2 and FP (integer will split). | |||
| 52478 | if (!IsIdentityPostShuffle && !Subtarget.hasAVX2() && VT.isFloatingPoint() && | |||
| 52479 | isMultiLaneShuffleMask(128, VT.getScalarSizeInBits(), PostShuffleMask)) | |||
| 52480 | return false; | |||
| 52481 | ||||
| 52482 | // If the source nodes are already used in HorizOps then always accept this. | |||
| 52483 | // Shuffle folding should merge these back together. | |||
| 52484 | bool FoundHorizLHS = llvm::any_of(NewLHS->uses(), [&](SDNode *User) { | |||
| 52485 | return User->getOpcode() == HOpcode && User->getValueType(0) == VT; | |||
| 52486 | }); | |||
| 52487 | bool FoundHorizRHS = llvm::any_of(NewRHS->uses(), [&](SDNode *User) { | |||
| 52488 | return User->getOpcode() == HOpcode && User->getValueType(0) == VT; | |||
| 52489 | }); | |||
| 52490 | bool ForceHorizOp = FoundHorizLHS && FoundHorizRHS; | |||
| 52491 | ||||
| 52492 | // Assume a SingleSource HOP if we only shuffle one input and don't need to | |||
| 52493 | // shuffle the result. | |||
| 52494 | if (!ForceHorizOp && | |||
| 52495 | !shouldUseHorizontalOp(NewLHS == NewRHS && | |||
| 52496 | (NumShuffles < 2 || !IsIdentityPostShuffle), | |||
| 52497 | DAG, Subtarget)) | |||
| 52498 | return false; | |||
| 52499 | ||||
| 52500 | LHS = DAG.getBitcast(VT, NewLHS); | |||
| 52501 | RHS = DAG.getBitcast(VT, NewRHS); | |||
| 52502 | return true; | |||
| 52503 | } | |||
| 52504 | ||||
| 52505 | // Try to synthesize horizontal (f)hadd/hsub from (f)adds/subs of shuffles. | |||
| 52506 | static SDValue combineToHorizontalAddSub(SDNode *N, SelectionDAG &DAG, | |||
| 52507 | const X86Subtarget &Subtarget) { | |||
| 52508 | EVT VT = N->getValueType(0); | |||
| 52509 | unsigned Opcode = N->getOpcode(); | |||
| 52510 | bool IsAdd = (Opcode == ISD::FADD) || (Opcode == ISD::ADD); | |||
| 52511 | SmallVector<int, 8> PostShuffleMask; | |||
| 52512 | ||||
| 52513 | switch (Opcode) { | |||
| 52514 | case ISD::FADD: | |||
| 52515 | case ISD::FSUB: | |||
| 52516 | if ((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) || | |||
| 52517 | (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) { | |||
| 52518 | SDValue LHS = N->getOperand(0); | |||
| 52519 | SDValue RHS = N->getOperand(1); | |||
| 52520 | auto HorizOpcode = IsAdd ? X86ISD::FHADD : X86ISD::FHSUB; | |||
| 52521 | if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd, | |||
| 52522 | PostShuffleMask)) { | |||
| 52523 | SDValue HorizBinOp = DAG.getNode(HorizOpcode, SDLoc(N), VT, LHS, RHS); | |||
| 52524 | if (!PostShuffleMask.empty()) | |||
| 52525 | HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, | |||
| 52526 | DAG.getUNDEF(VT), PostShuffleMask); | |||
| 52527 | return HorizBinOp; | |||
| 52528 | } | |||
| 52529 | } | |||
| 52530 | break; | |||
| 52531 | case ISD::ADD: | |||
| 52532 | case ISD::SUB: | |||
| 52533 | if (Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32 || | |||
| 52534 | VT == MVT::v16i16 || VT == MVT::v8i32)) { | |||
| 52535 | SDValue LHS = N->getOperand(0); | |||
| 52536 | SDValue RHS = N->getOperand(1); | |||
| 52537 | auto HorizOpcode = IsAdd ? X86ISD::HADD : X86ISD::HSUB; | |||
| 52538 | if (isHorizontalBinOp(HorizOpcode, LHS, RHS, DAG, Subtarget, IsAdd, | |||
| 52539 | PostShuffleMask)) { | |||
| 52540 | auto HOpBuilder = [HorizOpcode](SelectionDAG &DAG, const SDLoc &DL, | |||
| 52541 | ArrayRef<SDValue> Ops) { | |||
| 52542 | return DAG.getNode(HorizOpcode, DL, Ops[0].getValueType(), Ops); | |||
| 52543 | }; | |||
| 52544 | SDValue HorizBinOp = SplitOpsAndApply(DAG, Subtarget, SDLoc(N), VT, | |||
| 52545 | {LHS, RHS}, HOpBuilder); | |||
| 52546 | if (!PostShuffleMask.empty()) | |||
| 52547 | HorizBinOp = DAG.getVectorShuffle(VT, SDLoc(HorizBinOp), HorizBinOp, | |||
| 52548 | DAG.getUNDEF(VT), PostShuffleMask); | |||
| 52549 | return HorizBinOp; | |||
| 52550 | } | |||
| 52551 | } | |||
| 52552 | break; | |||
| 52553 | } | |||
| 52554 | ||||
| 52555 | return SDValue(); | |||
| 52556 | } | |||
| 52557 | ||||
| 52558 | // Try to combine the following nodes | |||
| 52559 | // t29: i64 = X86ISD::Wrapper TargetConstantPool:i64 | |||
| 52560 | // <i32 -2147483648[float -0.000000e+00]> 0 | |||
| 52561 | // t27: v16i32[v16f32],ch = X86ISD::VBROADCAST_LOAD | |||
| 52562 | // <(load 4 from constant-pool)> t0, t29 | |||
| 52563 | // [t30: v16i32 = bitcast t27] | |||
| 52564 | // t6: v16i32 = xor t7, t27[t30] | |||
| 52565 | // t11: v16f32 = bitcast t6 | |||
| 52566 | // t21: v16f32 = X86ISD::VFMULC[X86ISD::VCFMULC] t11, t8 | |||
| 52567 | // into X86ISD::VFCMULC[X86ISD::VFMULC] if possible: | |||
| 52568 | // t22: v16f32 = bitcast t7 | |||
| 52569 | // t23: v16f32 = X86ISD::VFCMULC[X86ISD::VFMULC] t8, t22 | |||
| 52570 | // t24: v32f16 = bitcast t23 | |||
| 52571 | static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, | |||
| 52572 | const X86Subtarget &Subtarget) { | |||
| 52573 | EVT VT = N->getValueType(0); | |||
| 52574 | SDValue LHS = N->getOperand(0); | |||
| 52575 | SDValue RHS = N->getOperand(1); | |||
| 52576 | int CombineOpcode = | |||
| 52577 | N->getOpcode() == X86ISD::VFCMULC ? X86ISD::VFMULC : X86ISD::VFCMULC; | |||
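| | // Roughly: each f32 lane holds one complex f16 value (real in the low half, | |||
| | // imaginary in the high half), so XOR-ing a lane with 0x80000000 - a splat of | |||
| | // -0.0f - flips only the imaginary sign, i.e. conjugates the value. | |||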
| 52578 | auto isConjugationConstant = [](const Constant *c) { | |||
| 52579 | if (const auto *CI = dyn_cast<ConstantInt>(c)) { | |||
| 52580 | APInt ConjugationInt32 = APInt(32, 0x80000000, true); | |||
| 52581 | APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true); | |||
| 52582 | switch (CI->getBitWidth()) { | |||
| 52583 | case 16: | |||
| 52584 | return false; | |||
| 52585 | case 32: | |||
| 52586 | return CI->getValue() == ConjugationInt32; | |||
| 52587 | case 64: | |||
| 52588 | return CI->getValue() == ConjugationInt64; | |||
| 52589 | default: | |||
| 52590 | llvm_unreachable("Unexpected bit width"); | |||
| 52591 | } | |||
| 52592 | } | |||
| 52593 | if (const auto *CF = dyn_cast<ConstantFP>(c)) | |||
| 52594 | return CF->isNegativeZeroValue(); | |||
| 52595 | return false; | |||
| 52596 | }; | |||
| 52597 | auto combineConjugation = [&](SDValue &r) { | |||
| 52598 | if (LHS->getOpcode() == ISD::BITCAST && RHS.hasOneUse()) { | |||
| 52599 | SDValue XOR = LHS.getOperand(0); | |||
| 52600 | if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) { | |||
| 52601 | SDValue XORRHS = XOR.getOperand(1); | |||
| 52602 | if (XORRHS.getOpcode() == ISD::BITCAST && XORRHS.hasOneUse()) | |||
| 52603 | XORRHS = XORRHS.getOperand(0); | |||
| 52604 | if (XORRHS.getOpcode() == X86ISD::VBROADCAST_LOAD && | |||
| 52605 | XORRHS.getOperand(1).getNumOperands()) { | |||
| 52606 | ConstantPoolSDNode *CP = | |||
| 52607 | dyn_cast<ConstantPoolSDNode>(XORRHS.getOperand(1).getOperand(0)); | |||
| 52608 | if (CP && isConjugationConstant(CP->getConstVal())) { | |||
| 52609 | SelectionDAG::FlagInserter FlagsInserter(DAG, N); | |||
| 52610 | SDValue I2F = DAG.getBitcast(VT, LHS.getOperand(0).getOperand(0)); | |||
| 52611 | SDValue FCMulC = DAG.getNode(CombineOpcode, SDLoc(N), VT, RHS, I2F); | |||
| 52612 | r = DAG.getBitcast(VT, FCMulC); | |||
| 52613 | return true; | |||
| 52614 | } | |||
| 52615 | } | |||
| 52616 | } | |||
| 52617 | } | |||
| 52618 | return false; | |||
| 52619 | }; | |||
| 52620 | SDValue Res; | |||
| 52621 | if (combineConjugation(Res)) | |||
| 52622 | return Res; | |||
| 52623 | std::swap(LHS, RHS); | |||
| 52624 | if (combineConjugation(Res)) | |||
| 52625 | return Res; | |||
| 52626 | return Res; | |||
| 52627 | } | |||
| 52628 | ||||
| 52629 | // Try to combine the following nodes: | |||
| 52630 | // FADD(A, FMA(B, C, 0)) and FADD(A, FMUL(B, C)) to FMA(B, C, A) | |||
| 52631 | static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, | |||
| 52632 | const X86Subtarget &Subtarget) { | |||
| 52633 | auto AllowContract = [&DAG](const SDNodeFlags &Flags) { | |||
| 52634 | return DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast || | |||
| 52635 | Flags.hasAllowContract(); | |||
| 52636 | }; | |||
| 52637 | ||||
| 52638 | auto HasNoSignedZero = [&DAG](const SDNodeFlags &Flags) { | |||
| 52639 | return DAG.getTarget().Options.NoSignedZerosFPMath || | |||
| 52640 | Flags.hasNoSignedZeros(); | |||
| 52641 | }; | |||
| 52642 | auto IsVectorAllNegativeZero = [](const SDNode *N) { | |||
| 52643 | if (N->getOpcode() != X86ISD::VBROADCAST_LOAD) | |||
| 52644 | return false; | |||
| 52645 | assert(N->getSimpleValueType(0).getScalarType() == MVT::f32 && | |||
| 52646 | "Unexpected vector type!"); | |||
| 52647 | if (ConstantPoolSDNode *CP = | |||
| 52648 | dyn_cast<ConstantPoolSDNode>(N->getOperand(1)->getOperand(0))) { | |||
| 52649 | APInt AI = APInt(32, 0x80008000, true); | |||
| 52650 | if (const auto *CI = dyn_cast<ConstantInt>(CP->getConstVal())) | |||
| 52651 | return CI->getValue() == AI; | |||
| 52652 | if (const auto *CF = dyn_cast<ConstantFP>(CP->getConstVal())) | |||
| 52653 | return CF->getValue() == APFloat(APFloat::IEEEsingle(), AI); | |||
| 52654 | } | |||
| 52655 | return false; | |||
| 52656 | }; | |||
| 52657 | ||||
| 52658 | if (N->getOpcode() != ISD::FADD || !Subtarget.hasFP16() || | |||
| 52659 | !AllowContract(N->getFlags())) | |||
| 52660 | return SDValue(); | |||
| 52661 | ||||
| 52662 | EVT VT = N->getValueType(0); | |||
| 52663 | if (VT != MVT::v8f16 && VT != MVT::v16f16 && VT != MVT::v32f16) | |||
| 52664 | return SDValue(); | |||
| 52665 | ||||
| 52666 | SDValue LHS = N->getOperand(0); | |||
| 52667 | SDValue RHS = N->getOperand(1); | |||
| 52668 | bool IsConj; | |||
| 52669 | SDValue FAddOp1, MulOp0, MulOp1; | |||
| 52670 | auto GetCFmulFrom = [&MulOp0, &MulOp1, &IsConj, &AllowContract, | |||
| 52671 | &IsVectorAllNegativeZero, | |||
| 52672 | &HasNoSignedZero](SDValue N) -> bool { | |||
| 52673 | if (!N.hasOneUse() || N.getOpcode() != ISD::BITCAST) | |||
| 52674 | return false; | |||
| 52675 | SDValue Op0 = N.getOperand(0); | |||
| 52676 | unsigned Opcode = Op0.getOpcode(); | |||
| 52677 | if (Op0.hasOneUse() && AllowContract(Op0->getFlags())) { | |||
| 52678 | if ((Opcode == X86ISD::VFMULC || Opcode == X86ISD::VFCMULC)) { | |||
| 52679 | MulOp0 = Op0.getOperand(0); | |||
| 52680 | MulOp1 = Op0.getOperand(1); | |||
| 52681 | IsConj = Opcode == X86ISD::VFCMULC; | |||
| 52682 | return true; | |||
| 52683 | } | |||
| 52684 | if ((Opcode == X86ISD::VFMADDC || Opcode == X86ISD::VFCMADDC) && | |||
| 52685 | ((ISD::isBuildVectorAllZeros(Op0->getOperand(2).getNode()) && | |||
| 52686 | HasNoSignedZero(Op0->getFlags())) || | |||
| 52687 | IsVectorAllNegativeZero(Op0->getOperand(2).getNode()))) { | |||
| 52688 | MulOp0 = Op0.getOperand(0); | |||
| 52689 | MulOp1 = Op0.getOperand(1); | |||
| 52690 | IsConj = Opcode == X86ISD::VFCMADDC; | |||
| 52691 | return true; | |||
| 52692 | } | |||
| 52693 | } | |||
| 52694 | return false; | |||
| 52695 | }; | |||
| 52696 | ||||
| 52697 | if (GetCFmulFrom(LHS)) | |||
| 52698 | FAddOp1 = RHS; | |||
| 52699 | else if (GetCFmulFrom(RHS)) | |||
| 52700 | FAddOp1 = LHS; | |||
| 52701 | else | |||
| 52702 | return SDValue(); | |||
| 52703 | ||||
| 52704 | MVT CVT = MVT::getVectorVT(MVT::f32, VT.getVectorNumElements() / 2); | |||
| 52705 | FAddOp1 = DAG.getBitcast(CVT, FAddOp1); | |||
| 52706 | unsigned NewOp = IsConj ? X86ISD::VFCMADDC : X86ISD::VFMADDC; | |||
| 52707 | // FIXME: How do we handle when fast math flags of FADD are different from | |||
| 52708 | // CFMUL's? | |||
| 52709 | SDValue CFmul = | |||
| 52710 | DAG.getNode(NewOp, SDLoc(N), CVT, MulOp0, MulOp1, FAddOp1, N->getFlags()); | |||
| 52711 | return DAG.getBitcast(VT, CFmul); | |||
| 52712 | } | |||
| 52713 | ||||
| 52714 | /// Do target-specific dag combines on floating-point adds/subs. | |||
| 52715 | static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG, | |||
| 52716 | const X86Subtarget &Subtarget) { | |||
| 52717 | if (SDValue HOp = combineToHorizontalAddSub(N, DAG, Subtarget)) | |||
| 52718 | return HOp; | |||
| 52719 | ||||
| 52720 | if (SDValue COp = combineFaddCFmul(N, DAG, Subtarget)) | |||
| 52721 | return COp; | |||
| 52722 | ||||
| 52723 | return SDValue(); | |||
| 52724 | } | |||
| 52725 | ||||
| 52726 | /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify | |||
| 52727 | /// the codegen. | |||
| 52728 | /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) ) | |||
| 52729 | /// TODO: This overlaps with the generic combiner's visitTRUNCATE. Remove | |||
| 52730 | /// anything that is guaranteed to be transformed by DAGCombiner. | |||
| 52731 | static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, | |||
| 52732 | const X86Subtarget &Subtarget, | |||
| 52733 | const SDLoc &DL) { | |||
| 52734 | assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode"); | |||
| 52735 | SDValue Src = N->getOperand(0); | |||
| 52736 | unsigned SrcOpcode = Src.getOpcode(); | |||
| 52737 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 52738 | ||||
| 52739 | EVT VT = N->getValueType(0); | |||
| 52740 | EVT SrcVT = Src.getValueType(); | |||
| 52741 | ||||
| 52742 | auto IsFreeTruncation = [VT](SDValue Op) { | |||
| 52743 | unsigned TruncSizeInBits = VT.getScalarSizeInBits(); | |||
| 52744 | ||||
| 52745 | // See if this has been extended from a smaller/equal size to | |||
| 52746 | // the truncation size, allowing a truncation to combine with the extend. | |||
| 52747 | unsigned Opcode = Op.getOpcode(); | |||
| 52748 | if ((Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND || | |||
| 52749 | Opcode == ISD::ZERO_EXTEND) && | |||
| 52750 | Op.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits) | |||
| 52751 | return true; | |||
| 52752 | ||||
| 52753 | // See if this is a single use constant which can be constant folded. | |||
| 52754 | // NOTE: We don't peek through bitcasts here because there is currently | |||
| 52755 | // no support for constant folding truncate+bitcast+vector_of_constants. So | |||
| 52756 | // we'll just end up with a truncate on both operands which will | |||
| 52757 | // get turned back into (truncate (binop)) causing an infinite loop. | |||
| 52758 | return ISD::isBuildVectorOfConstantSDNodes(Op.getNode()); | |||
| 52759 | }; | |||
| 52760 | ||||
| 52761 | auto TruncateArithmetic = [&](SDValue N0, SDValue N1) { | |||
| 52762 | SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0); | |||
| 52763 | SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1); | |||
| 52764 | return DAG.getNode(SrcOpcode, DL, VT, Trunc0, Trunc1); | |||
| 52765 | }; | |||
| 52766 | ||||
| 52767 | // Don't combine if the operation has other uses. | |||
| 52768 | if (!Src.hasOneUse()) | |||
| 52769 | return SDValue(); | |||
| 52770 | ||||
| 52771 | // Only support vector truncation for now. | |||
| 52772 | // TODO: i64 scalar math would benefit as well. | |||
| 52773 | if (!VT.isVector()) | |||
| 52774 | return SDValue(); | |||
| 52775 | ||||
| 52776 | // In most cases it's only worth pre-truncating if we're only facing the cost | |||
| 52777 | // of one truncation. | |||
| 52778 | // i.e. if one of the inputs will constant fold or the input is repeated. | |||
| 52779 | switch (SrcOpcode) { | |||
| 52780 | case ISD::MUL: | |||
| 52781 | // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's | |||
| 52782 | // better to truncate if we have the chance. | |||
| 52783 | if (SrcVT.getScalarType() == MVT::i64 && | |||
| 52784 | TLI.isOperationLegal(SrcOpcode, VT) && | |||
| 52785 | !TLI.isOperationLegal(SrcOpcode, SrcVT)) | |||
| 52786 | return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1)); | |||
| 52787 | [[fallthrough]]; | |||
| 52788 | case ISD::AND: | |||
| 52789 | case ISD::XOR: | |||
| 52790 | case ISD::OR: | |||
| 52791 | case ISD::ADD: | |||
| 52792 | case ISD::SUB: { | |||
| 52793 | SDValue Op0 = Src.getOperand(0); | |||
| 52794 | SDValue Op1 = Src.getOperand(1); | |||
| 52795 | if (TLI.isOperationLegal(SrcOpcode, VT) && | |||
| 52796 | (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1))) | |||
| 52797 | return TruncateArithmetic(Op0, Op1); | |||
| 52798 | break; | |||
| 52799 | } | |||
| 52800 | } | |||
| 52801 | ||||
| 52802 | return SDValue(); | |||
| 52803 | } | |||
| 52804 | ||||
| 52805 | /// Truncate using ISD::AND mask and X86ISD::PACKUS. | |||
| 52806 | /// e.g. trunc <8 x i32> X to <8 x i16> --> | |||
| 52807 | /// MaskX = X & 0xffff (clear high bits to prevent saturation) | |||
| 52808 | /// packus (extract_subv MaskX, 0), (extract_subv MaskX, 1) | |||
| 52809 | static SDValue combineVectorTruncationWithPACKUS(SDNode *N, const SDLoc &DL, | |||
| 52810 | const X86Subtarget &Subtarget, | |||
| 52811 | SelectionDAG &DAG) { | |||
| 52812 | SDValue In = N->getOperand(0); | |||
| 52813 | EVT InVT = In.getValueType(); | |||
| 52814 | EVT OutVT = N->getValueType(0); | |||
| 52815 | ||||
| 52816 | APInt Mask = APInt::getLowBitsSet(InVT.getScalarSizeInBits(), | |||
| 52817 | OutVT.getScalarSizeInBits()); | |||
| 52818 | In = DAG.getNode(ISD::AND, DL, InVT, In, DAG.getConstant(Mask, DL, InVT)); | |||
| 52819 | return truncateVectorWithPACK(X86ISD::PACKUS, OutVT, In, DL, DAG, Subtarget); | |||
| 52820 | } | |||
| 52821 | ||||
| 52822 | /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS. | |||
| 52823 | static SDValue combineVectorTruncationWithPACKSS(SDNode *N, const SDLoc &DL, | |||
| 52824 | const X86Subtarget &Subtarget, | |||
| 52825 | SelectionDAG &DAG) { | |||
| 52826 | SDValue In = N->getOperand(0); | |||
| 52827 | EVT InVT = In.getValueType(); | |||
| 52828 | EVT OutVT = N->getValueType(0); | |||
| 52829 | In = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InVT, In, | |||
| 52830 | DAG.getValueType(OutVT)); | |||
| 52831 | return truncateVectorWithPACK(X86ISD::PACKSS, OutVT, In, DL, DAG, Subtarget); | |||
| 52832 | } | |||
| 52833 | ||||
| 52834 | /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into | |||
| 52835 | /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type | |||
| 52836 | /// legalization the truncation will be translated into a BUILD_VECTOR with each | |||
| 52837 | /// element that is extracted from a vector and then truncated, and it is | |||
| 52838 | // difficult to do this optimization based on that form. | |||
| 52839 | static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, | |||
| 52840 | const X86Subtarget &Subtarget) { | |||
| 52841 | EVT OutVT = N->getValueType(0); | |||
| 52842 | if (!OutVT.isVector()) | |||
| 52843 | return SDValue(); | |||
| 52844 | ||||
| 52845 | SDValue In = N->getOperand(0); | |||
| 52846 | if (!In.getValueType().isSimple()) | |||
| 52847 | return SDValue(); | |||
| 52848 | ||||
| 52849 | EVT InVT = In.getValueType(); | |||
| 52850 | unsigned NumElems = OutVT.getVectorNumElements(); | |||
| 52851 | ||||
| 52852 | // AVX512 provides fast truncate ops. | |||
| 52853 | if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) | |||
| 52854 | return SDValue(); | |||
| 52855 | ||||
| 52856 | EVT OutSVT = OutVT.getVectorElementType(); | |||
| 52857 | EVT InSVT = InVT.getVectorElementType(); | |||
| 52858 | if (!((InSVT == MVT::i16 || InSVT == MVT::i32 || InSVT == MVT::i64) && | |||
| 52859 | (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) && | |||
| 52860 | NumElems >= 8)) | |||
| 52861 | return SDValue(); | |||
| 52862 | ||||
| 52863 | // SSSE3's pshufb results in fewer instructions in the cases below. | |||
| 52864 | if (Subtarget.hasSSSE3() && NumElems == 8) { | |||
| 52865 | if (InSVT == MVT::i16) | |||
| 52866 | return SDValue(); | |||
| 52867 | if (InSVT == MVT::i32 && | |||
| 52868 | (OutSVT == MVT::i8 || !Subtarget.hasSSE41() || Subtarget.hasInt256())) | |||
| 52869 | return SDValue(); | |||
| 52870 | } | |||
| 52871 | ||||
| 52872 | SDLoc DL(N); | |||
| 52873 | // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS | |||
| 52874 | // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to | |||
| 52875 | // truncate 2 x v4i32 to v8i16. | |||
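| | // e.g. without SSE4.1, v8i32 -> v8i16 sign-extends each i32 in place first | |||
| | // (so PACKSS saturation is a no-op) and then packs with PACKSSDW. | |||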
| 52876 | if (Subtarget.hasSSE41() || OutSVT == MVT::i8) | |||
| 52877 | return combineVectorTruncationWithPACKUS(N, DL, Subtarget, DAG); | |||
| 52878 | if (InSVT == MVT::i32) | |||
| 52879 | return combineVectorTruncationWithPACKSS(N, DL, Subtarget, DAG); | |||
| 52880 | ||||
| 52881 | return SDValue(); | |||
| 52882 | } | |||
| 52883 | ||||
| 52884 | /// This function transforms vector truncation of 'extended sign-bits' or | |||
| 52885 | /// 'extended zero-bits' values from vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 | |||
| 52886 | /// into X86ISD::PACKSS/PACKUS operations. | |||
| 52887 | static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL, | |||
| 52888 | SelectionDAG &DAG, | |||
| 52889 | const X86Subtarget &Subtarget) { | |||
| 52890 | // Requires SSE2. | |||
| 52891 | if (!Subtarget.hasSSE2()) | |||
| 52892 | return SDValue(); | |||
| 52893 | ||||
| 52894 | if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple()) | |||
| 52895 | return SDValue(); | |||
| 52896 | ||||
| 52897 | SDValue In = N->getOperand(0); | |||
| 52898 | if (!In.getValueType().isSimple()) | |||
| 52899 | return SDValue(); | |||
| 52900 | ||||
| 52901 | MVT VT = N->getValueType(0).getSimpleVT(); | |||
| 52902 | MVT SVT = VT.getScalarType(); | |||
| 52903 | ||||
| 52904 | MVT InVT = In.getValueType().getSimpleVT(); | |||
| 52905 | MVT InSVT = InVT.getScalarType(); | |||
| 52906 | ||||
| 52907 | // Check we have a truncation suited for PACKSS/PACKUS. | |||
| 52908 | if (!isPowerOf2_32(VT.getVectorNumElements())) | |||
| 52909 | return SDValue(); | |||
| 52910 | if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32) | |||
| 52911 | return SDValue(); | |||
| 52912 | if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64) | |||
| 52913 | return SDValue(); | |||
| 52914 | ||||
| 52915 | // Truncation to sub-128-bit vXi32 can be better handled with shuffles. | |||
| 52916 | if (SVT == MVT::i32 && VT.getSizeInBits() < 128) | |||
| 52917 | return SDValue(); | |||
| 52918 | ||||
| 52919 | // AVX512 has fast truncate, but if the input is already going to be split, | |||
| 52920 | // there's no harm in trying pack. | |||
| 52921 | if (Subtarget.hasAVX512() && | |||
| 52922 | !(!Subtarget.useAVX512Regs() && VT.is256BitVector() && | |||
| 52923 | InVT.is512BitVector())) { | |||
| 52924 | // PACK should still be worth it for 128-bit vectors if the sources were | |||
| 52925 | // originally concatenated from subvectors. | |||
| 52926 | SmallVector<SDValue> ConcatOps; | |||
| 52927 | if (VT.getSizeInBits() > 128 || | |||
| 52928 | !collectConcatOps(In.getNode(), ConcatOps, DAG)) | |||
| 52929 | return SDValue(); | |||
| 52930 | } | |||
| 52931 | ||||
| 52932 | unsigned NumPackedSignBits = std::min<unsigned>(SVT.getSizeInBits(), 16); | |||
| 52933 | unsigned NumPackedZeroBits = Subtarget.hasSSE41() ? NumPackedSignBits : 8; | |||
| 52934 | ||||
| 52935 | // Use PACKUS if the input has zero-bits that extend all the way to the | |||
| 52936 | // packed/truncated value. e.g. masks, zext_in_reg, etc. | |||
| 52937 | KnownBits Known = DAG.computeKnownBits(In); | |||
| 52938 | unsigned NumLeadingZeroBits = Known.countMinLeadingZeros(); | |||
| 52939 | if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedZeroBits)) | |||
| 52940 | return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget); | |||
| 52941 | ||||
| 52942 | // Use PACKSS if the input has sign-bits that extend all the way to the | |||
| 52943 | // packed/truncated value. e.g. Comparison result, sext_in_reg, etc. | |||
| 52944 | unsigned NumSignBits = DAG.ComputeNumSignBits(In); | |||
| 52945 | ||||
| 52946 | // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with | |||
| 52947 | // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later | |||
| 52948 | // on and combines/simplifications can't then use it. | |||
| 52949 | if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits()) | |||
| 52950 | return SDValue(); | |||
| 52951 | ||||
| 52952 | unsigned MinSignBits = InSVT.getSizeInBits() - NumPackedSignBits; | |||
| 52953 | if (NumSignBits > MinSignBits) | |||
| 52954 | return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget); | |||
| 52955 | ||||
| 52956 | // If we have a srl that only generates signbits that we will discard in | |||
| 52957 | // the truncation then we can use PACKSS by converting the srl to a sra. | |||
| 52958 | // SimplifyDemandedBits often relaxes sra to srl so we need to reverse it. | |||
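| | // e.g. for v4i32 -> v4i16, (srl X, 16) and (sra X, 16) agree on the 16 bits | |||
| | // we keep, and the sra form has the sign bits PACKSS needs. | |||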
| 52959 | if (In.getOpcode() == ISD::SRL && N->isOnlyUserOf(In.getNode())) | |||
| 52960 | if (const APInt *ShAmt = DAG.getValidShiftAmountConstant( | |||
| 52961 | In, APInt::getAllOnes(VT.getVectorNumElements()))) { | |||
| 52962 | if (*ShAmt == MinSignBits) { | |||
| 52963 | SDValue NewIn = DAG.getNode(ISD::SRA, DL, InVT, In->ops()); | |||
| 52964 | return truncateVectorWithPACK(X86ISD::PACKSS, VT, NewIn, DL, DAG, | |||
| 52965 | Subtarget); | |||
| 52966 | } | |||
| 52967 | } | |||
| 52968 | ||||
| 52969 | return SDValue(); | |||
| 52970 | } | |||
| 52971 | ||||
| 52972 | // Try to form a MULHU or MULHS node by looking for | |||
| 52973 | // (trunc (srl (mul ext, ext), 16)) | |||
| 52974 | // TODO: This is X86 specific because we want to be able to handle wide types | |||
| 52975 | // before type legalization. But we can only do it if the vector will be | |||
| 52976 | // legalized via widening/splitting. Type legalization can't handle promotion | |||
| 52977 | // of a MULHU/MULHS. There isn't a way to convey this to the generic DAG | |||
| 52978 | // combiner. | |||
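| | // e.g. trunc (srl (mul (zext X:v8i16 to v8i32), (zext Y:v8i16 to v8i32)), 16) | |||
| | // to v8i16 becomes (mulhu X, Y), folding away the wide multiply. | |||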
| 52979 | static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, | |||
| 52980 | SelectionDAG &DAG, const X86Subtarget &Subtarget) { | |||
| 52981 | // First instruction should be a right shift of a multiply. | |||
| 52982 | if (Src.getOpcode() != ISD::SRL || | |||
| 52983 | Src.getOperand(0).getOpcode() != ISD::MUL) | |||
| 52984 | return SDValue(); | |||
| 52985 | ||||
| 52986 | if (!Subtarget.hasSSE2()) | |||
| 52987 | return SDValue(); | |||
| 52988 | ||||
| 52989 | // Only handle vXi16 types that are at least 128-bits unless they will be | |||
| 52990 | // widened. | |||
| 52991 | if (!VT.isVector() || VT.getVectorElementType() != MVT::i16) | |||
| 52992 | return SDValue(); | |||
| 52993 | ||||
| 52994 | // Input type should be at least vXi32. | |||
| 52995 | EVT InVT = Src.getValueType(); | |||
| 52996 | if (InVT.getVectorElementType().getSizeInBits() < 32) | |||
| 52997 | return SDValue(); | |||
| 52998 | ||||
| 52999 | // Need a shift by 16. | |||
| 53000 | APInt ShiftAmt; | |||
| 53001 | if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) || | |||
| 53002 | ShiftAmt != 16) | |||
| 53003 | return SDValue(); | |||
| 53004 | ||||
| 53005 | SDValue LHS = Src.getOperand(0).getOperand(0); | |||
| 53006 | SDValue RHS = Src.getOperand(0).getOperand(1); | |||
| 53007 | ||||
| 53008 | // Count leading sign/zero bits on both inputs - if there are enough then | |||
| 53009 | // truncation back to vXi16 will be cheap - either as a pack/shuffle | |||
| 53010 | // sequence or using AVX512 truncations. If the inputs are sext/zext then the | |||
| 53011 | // truncations may actually be free by peeking through to the ext source. | |||
| 53012 | auto IsSext = [&DAG](SDValue V) { | |||
| 53013 | return DAG.ComputeMaxSignificantBits(V) <= 16; | |||
| 53014 | }; | |||
| 53015 | auto IsZext = [&DAG](SDValue V) { | |||
| 53016 | return DAG.computeKnownBits(V).countMaxActiveBits() <= 16; | |||
| 53017 | }; | |||
| 53018 | ||||
| 53019 | bool IsSigned = IsSext(LHS) && IsSext(RHS); | |||
| 53020 | bool IsUnsigned = IsZext(LHS) && IsZext(RHS); | |||
| 53021 | if (!IsSigned && !IsUnsigned) | |||
| 53022 | return SDValue(); | |||
| 53023 | ||||
| 53024 | // Check if both inputs are extensions, which will be removed by truncation. | |||
| 53025 | bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND || | |||
| 53026 | LHS.getOpcode() == ISD::ZERO_EXTEND) && | |||
| 53027 | (RHS.getOpcode() == ISD::SIGN_EXTEND || | |||
| 53028 | RHS.getOpcode() == ISD::ZERO_EXTEND) && | |||
| 53029 | LHS.getOperand(0).getScalarValueSizeInBits() <= 16 && | |||
| 53030 | RHS.getOperand(0).getScalarValueSizeInBits() <= 16; | |||
| 53031 | ||||
| 53032 | // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on | |||
| 53033 | // the (bitcasted) inputs directly, and then cheaply pack/truncate the result | |||
| 53034 | // (upper elts will be zero). Don't attempt this with just AVX512F as MULHU | |||
| 53035 | // will have to split anyway. | |||
| 53036 | unsigned InSizeInBits = InVT.getSizeInBits(); | |||
| 53037 | if (IsUnsigned && !IsTruncateFree && Subtarget.hasInt256() && | |||
| 53038 | !(Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.is256BitVector()) && | |||
| 53039 | (InSizeInBits % 16) == 0) { | |||
| 53040 | EVT BCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, | |||
| 53041 | InVT.getSizeInBits() / 16); | |||
| 53042 | SDValue Res = DAG.getNode(ISD::MULHU, DL, BCVT, DAG.getBitcast(BCVT, LHS), | |||
| 53043 | DAG.getBitcast(BCVT, RHS)); | |||
| 53044 | return DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getBitcast(InVT, Res)); | |||
| 53045 | } | |||
| 53046 | ||||
| 53047 | // Truncate back to source type. | |||
| 53048 | LHS = DAG.getNode(ISD::TRUNCATE, DL, VT, LHS); | |||
| 53049 | RHS = DAG.getNode(ISD::TRUNCATE, DL, VT, RHS); | |||
| 53050 | ||||
| 53051 | unsigned Opc = IsSigned ? ISD::MULHS : ISD::MULHU; | |||
| 53052 | return DAG.getNode(Opc, DL, VT, LHS, RHS); | |||
| 53053 | } | |||
| 53054 | ||||
| 53055 | // Attempt to match PMADDUBSW, which multiplies corresponding unsigned bytes | |||
| 53056 | // from one vector with signed bytes from another vector, adds together | |||
| 53057 | // adjacent pairs of 16-bit products, and saturates the result before | |||
| 53058 | // truncating to 16-bits. | |||
| 53059 | // | |||
| 53060 | // Which looks something like this: | |||
| 53061 | // (i16 (ssat (add (mul (zext (even elts (i8 A))), (sext (even elts (i8 B)))), | |||
| 53062 | // (mul (zext (odd elts (i8 A)), (sext (odd elts (i8 B)))))))) | |||
| 53063 | static SDValue detectPMADDUBSW(SDValue In, EVT VT, SelectionDAG &DAG, | |||
| 53064 | const X86Subtarget &Subtarget, | |||
| 53065 | const SDLoc &DL) { | |||
| 53066 | if (!VT.isVector() || !Subtarget.hasSSSE3()) | |||
| 53067 | return SDValue(); | |||
| 53068 | ||||
| 53069 | unsigned NumElems = VT.getVectorNumElements(); | |||
| 53070 | EVT ScalarVT = VT.getVectorElementType(); | |||
| 53071 | if (ScalarVT != MVT::i16 || NumElems < 8 || !isPowerOf2_32(NumElems)) | |||
| 53072 | return SDValue(); | |||
| 53073 | ||||
| 53074 | SDValue SSatVal = detectSSatPattern(In, VT); | |||
| 53075 | if (!SSatVal || SSatVal.getOpcode() != ISD::ADD) | |||
| 53076 | return SDValue(); | |||
| 53077 | ||||
| 53078 | // Ok this is a signed saturation of an ADD. See if this ADD is adding pairs | |||
| 53079 | // of multiplies from even/odd elements. | |||
| 53080 | SDValue N0 = SSatVal.getOperand(0); | |||
| 53081 | SDValue N1 = SSatVal.getOperand(1); | |||
| 53082 | ||||
| 53083 | if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL) | |||
| 53084 | return SDValue(); | |||
| 53085 | ||||
| 53086 | SDValue N00 = N0.getOperand(0); | |||
| 53087 | SDValue N01 = N0.getOperand(1); | |||
| 53088 | SDValue N10 = N1.getOperand(0); | |||
| 53089 | SDValue N11 = N1.getOperand(1); | |||
| 53090 | ||||
| 53091 | // TODO: Handle constant vectors and use knownbits/computenumsignbits? | |||
| 53092 | // Canonicalize zero_extend to LHS. | |||
| 53093 | if (N01.getOpcode() == ISD::ZERO_EXTEND) | |||
| 53094 | std::swap(N00, N01); | |||
| 53095 | if (N11.getOpcode() == ISD::ZERO_EXTEND) | |||
| 53096 | std::swap(N10, N11); | |||
| 53097 | ||||
| 53098 | // Ensure we have a zero_extend and a sign_extend. | |||
| 53099 | if (N00.getOpcode() != ISD::ZERO_EXTEND || | |||
| 53100 | N01.getOpcode() != ISD::SIGN_EXTEND || | |||
| 53101 | N10.getOpcode() != ISD::ZERO_EXTEND || | |||
| 53102 | N11.getOpcode() != ISD::SIGN_EXTEND) | |||
| 53103 | return SDValue(); | |||
| 53104 | ||||
| 53105 | // Peek through the extends. | |||
| 53106 | N00 = N00.getOperand(0); | |||
| 53107 | N01 = N01.getOperand(0); | |||
| 53108 | N10 = N10.getOperand(0); | |||
| 53109 | N11 = N11.getOperand(0); | |||
| 53110 | ||||
| 53111 | // Ensure the extend is from vXi8. | |||
| 53112 | if (N00.getValueType().getVectorElementType() != MVT::i8 || | |||
| 53113 | N01.getValueType().getVectorElementType() != MVT::i8 || | |||
| 53114 | N10.getValueType().getVectorElementType() != MVT::i8 || | |||
| 53115 | N11.getValueType().getVectorElementType() != MVT::i8) | |||
| 53116 | return SDValue(); | |||
| 53117 | ||||
| 53118 | // All inputs should be build_vectors. | |||
| 53119 | if (N00.getOpcode() != ISD::BUILD_VECTOR || | |||
| 53120 | N01.getOpcode() != ISD::BUILD_VECTOR || | |||
| 53121 | N10.getOpcode() != ISD::BUILD_VECTOR || | |||
| 53122 | N11.getOpcode() != ISD::BUILD_VECTOR) | |||
| 53123 | return SDValue(); | |||
| 53124 | ||||
| 53125 | // N00/N10 are zero extended. N01/N11 are sign extended. | |||
| 53126 | ||||
| 53127 | // For each element, we need to ensure we have an odd element from one vector | |||
| 53128 | // multiplied by the odd element of another vector and the even element from | |||
| 53129 | // one of the same vectors being multiplied by the even element from the | |||
| 53130 | // other vector. So we need to make sure for each element i, this operator | |||
| 53131 | // is being performed: | |||
| 53132 | // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1] | |||
| 53133 | SDValue ZExtIn, SExtIn; | |||
| 53134 | for (unsigned i = 0; i != NumElems; ++i) { | |||
| 53135 | SDValue N00Elt = N00.getOperand(i); | |||
| 53136 | SDValue N01Elt = N01.getOperand(i); | |||
| 53137 | SDValue N10Elt = N10.getOperand(i); | |||
| 53138 | SDValue N11Elt = N11.getOperand(i); | |||
| 53139 | // TODO: Be more tolerant to undefs. | |||
| 53140 | if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || | |||
| 53141 | N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || | |||
| 53142 | N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || | |||
| 53143 | N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT) | |||
| 53144 | return SDValue(); | |||
| 53145 | auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1)); | |||
| 53146 | auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1)); | |||
| 53147 | auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1)); | |||
| 53148 | auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1)); | |||
| 53149 | if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt) | |||
| 53150 | return SDValue(); | |||
| 53151 | unsigned IdxN00 = ConstN00Elt->getZExtValue(); | |||
| 53152 | unsigned IdxN01 = ConstN01Elt->getZExtValue(); | |||
| 53153 | unsigned IdxN10 = ConstN10Elt->getZExtValue(); | |||
| 53154 | unsigned IdxN11 = ConstN11Elt->getZExtValue(); | |||
| 53155 | // Add is commutative so indices can be reordered. | |||
| 53156 | if (IdxN00 > IdxN10) { | |||
| 53157 | std::swap(IdxN00, IdxN10); | |||
| 53158 | std::swap(IdxN01, IdxN11); | |||
| 53159 | } | |||
| 53160 | // N0 indices must be the even elements. N1 indices must be the next odd elements. | |||
| 53161 | if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || | |||
| 53162 | IdxN01 != 2 * i || IdxN11 != 2 * i + 1) | |||
| 53163 | return SDValue(); | |||
| 53164 | SDValue N00In = N00Elt.getOperand(0); | |||
| 53165 | SDValue N01In = N01Elt.getOperand(0); | |||
| 53166 | SDValue N10In = N10Elt.getOperand(0); | |||
| 53167 | SDValue N11In = N11Elt.getOperand(0); | |||
| 53168 | // First time we find an input capture it. | |||
| 53169 | if (!ZExtIn) { | |||
| 53170 | ZExtIn = N00In; | |||
| 53171 | SExtIn = N01In; | |||
| 53172 | } | |||
| 53173 | if (ZExtIn != N00In || SExtIn != N01In || | |||
| 53174 | ZExtIn != N10In || SExtIn != N11In) | |||
| 53175 | return SDValue(); | |||
| 53176 | } | |||
| 53177 | ||||
| 53178 | auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, | |||
| 53179 | ArrayRef<SDValue> Ops) { | |||
| 53180 | // Shrink by adding truncate nodes and let DAGCombine fold with the | |||
| 53181 | // sources. | |||
| 53182 | EVT InVT = Ops[0].getValueType(); | |||
| 53183 | assert(InVT.getScalarType() == MVT::i8 && | |||
| 53184 | "Unexpected scalar element type"); | |||
| 53185 | assert(InVT == Ops[1].getValueType() && "Operands' types mismatch"); | |||
| 53186 | EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, | |||
| 53187 | InVT.getVectorNumElements() / 2); | |||
| 53188 | return DAG.getNode(X86ISD::VPMADDUBSW, DL, ResVT, Ops[0], Ops[1]); | |||
| 53189 | }; | |||
| 53190 | return SplitOpsAndApply(DAG, Subtarget, DL, VT, { ZExtIn, SExtIn }, | |||
| 53191 | PMADDBuilder); | |||
| 53192 | } | |||
| 53193 | ||||
| 53194 | static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, | |||
| 53195 | const X86Subtarget &Subtarget) { | |||
| 53196 | EVT VT = N->getValueType(0); | |||
| 53197 | SDValue Src = N->getOperand(0); | |||
| 53198 | SDLoc DL(N); | |||
| 53199 | ||||
| 53200 | // Attempt to pre-truncate inputs to arithmetic ops instead. | |||
| 53201 | if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL)) | |||
| 53202 | return V; | |||
| 53203 | ||||
| 53204 | // Try to detect AVG pattern first. | |||
| 53205 | if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL)) | |||
| 53206 | return Avg; | |||
| 53207 | ||||
| 53208 | // Try to detect PMADD | |||
| 53209 | if (SDValue PMAdd = detectPMADDUBSW(Src, VT, DAG, Subtarget, DL)) | |||
| 53210 | return PMAdd; | |||
| 53211 | ||||
| 53212 | // Try to combine truncation with signed/unsigned saturation. | |||
| 53213 | if (SDValue Val = combineTruncateWithSat(Src, VT, DL, DAG, Subtarget)) | |||
| 53214 | return Val; | |||
| 53215 | ||||
| 53216 | // Try to combine PMULHUW/PMULHW for vXi16. | |||
| 53217 | if (SDValue V = combinePMULH(Src, VT, DL, DAG, Subtarget)) | |||
| 53218 | return V; | |||
| 53219 | ||||
| 53220 | // The bitcast source is a direct mmx result. | |||
| 53221 | // Detect bitcasts from x86mmx that are truncated to i32. | |||
| 53222 | if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) { | |||
| 53223 | SDValue BCSrc = Src.getOperand(0); | |||
| 53224 | if (BCSrc.getValueType() == MVT::x86mmx) | |||
| 53225 | return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc); | |||
| 53226 | } | |||
| 53227 | ||||
| 53228 | // Try to truncate extended sign/zero bits with PACKSS/PACKUS. | |||
| 53229 | if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget)) | |||
| 53230 | return V; | |||
| 53231 | ||||
| 53232 | return combineVectorTruncation(N, DAG, Subtarget); | |||
| 53233 | } | |||
| 53234 | ||||
| 53235 | static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG, | |||
| 53236 | TargetLowering::DAGCombinerInfo &DCI) { | |||
| 53237 | EVT VT = N->getValueType(0); | |||
| 53238 | SDValue In = N->getOperand(0); | |||
| 53239 | SDLoc DL(N); | |||
| 53240 | ||||
| 53241 | if (SDValue SSatVal = detectSSatPattern(In, VT)) | |||
| 53242 | return DAG.getNode(X86ISD::VTRUNCS, DL, VT, SSatVal); | |||
| 53243 | if (SDValue USatVal = detectUSatPattern(In, VT, DAG, DL)) | |||
| 53244 | return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal); | |||
| 53245 | ||||
| 53246 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 53247 | APInt DemandedMask(APInt::getAllOnes(VT.getScalarSizeInBits())); | |||
| 53248 | if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI)) | |||
| 53249 | return SDValue(N, 0); | |||
| 53250 | ||||
| 53251 | return SDValue(); | |||
| 53252 | } | |||
| 53253 | ||||
| 53254 | /// Returns the negated value if the node \p N flips sign of FP value. | |||
| 53255 | /// | |||
| 53256 | /// FP-negation node may have different forms: FNEG(x), FXOR (x, 0x80000000) | |||
| 53257 | /// or FSUB(0, x) | |||
| 53258 | /// AVX512F does not have FXOR, so FNEG is lowered as | |||
| 53259 | /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))). | |||
| 53260 | /// In this case we go through all bitcasts. | |||
| 53261 | /// This also recognizes splat of a negated value and returns the splat of that | |||
| 53262 | /// value. | |||
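| | // e.g. (bitcast (xor (bitcast X to v4i32), (splat 0x80000000)) to v4f32) | |||
| | // flips every lane's sign bit, so X is returned as the un-negated value. | |||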
| 53263 | static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) { | |||
| 53264 | if (N->getOpcode() == ISD::FNEG) | |||
| 53265 | return N->getOperand(0); | |||
| 53266 | ||||
| 53267 | // Don't recurse exponentially. | |||
| 53268 | if (Depth > SelectionDAG::MaxRecursionDepth) | |||
| 53269 | return SDValue(); | |||
| 53270 | ||||
| 53271 | unsigned ScalarSize = N->getValueType(0).getScalarSizeInBits(); | |||
| 53272 | ||||
| 53273 | SDValue Op = peekThroughBitcasts(SDValue(N, 0)); | |||
| 53274 | EVT VT = Op->getValueType(0); | |||
| 53275 | ||||
| 53276 | // Make sure the element size doesn't change. | |||
| 53277 | if (VT.getScalarSizeInBits() != ScalarSize) | |||
| 53278 | return SDValue(); | |||
| 53279 | ||||
| 53280 | unsigned Opc = Op.getOpcode(); | |||
| 53281 | switch (Opc) { | |||
| 53282 | case ISD::VECTOR_SHUFFLE: { | |||
| 53283 | // For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate | |||
| 53284 | // of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here. | |||
| 53285 | if (!Op.getOperand(1).isUndef()) | |||
| 53286 | return SDValue(); | |||
| 53287 | if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1)) | |||
| 53288 | if (NegOp0.getValueType() == VT) // FIXME: Can we do better? | |||
| 53289 | return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT), | |||
| 53290 | cast<ShuffleVectorSDNode>(Op)->getMask()); | |||
| 53291 | break; | |||
| 53292 | } | |||
| 53293 | case ISD::INSERT_VECTOR_ELT: { | |||
| 53294 | // Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF, | |||
| 53295 | // -V, INDEX). | |||
| 53296 | SDValue InsVector = Op.getOperand(0); | |||
| 53297 | SDValue InsVal = Op.getOperand(1); | |||
| 53298 | if (!InsVector.isUndef()) | |||
| 53299 | return SDValue(); | |||
| 53300 | if (SDValue NegInsVal = isFNEG(DAG, InsVal.getNode(), Depth + 1)) | |||
| 53301 | if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME | |||
| 53302 | return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector, | |||
| 53303 | NegInsVal, Op.getOperand(2)); | |||
| 53304 | break; | |||
| 53305 | } | |||
| 53306 | case ISD::FSUB: | |||
| 53307 | case ISD::XOR: | |||
| 53308 | case X86ISD::FXOR: { | |||
| 53309 | SDValue Op1 = Op.getOperand(1); | |||
| 53310 | SDValue Op0 = Op.getOperand(0); | |||
| 53311 | ||||
| 53312 | // For XOR and FXOR, we want to check if constant | |||
| 53313 | // bits of Op1 are sign bit masks. For FSUB, we | |||
| 53314 | // have to check if constant bits of Op0 are sign | |||
| 53315 | // bit masks and hence we swap the operands. | |||
| 53316 | if (Opc == ISD::FSUB) | |||
| 53317 | std::swap(Op0, Op1); | |||
| 53318 | ||||
| 53319 | APInt UndefElts; | |||
| 53320 | SmallVector<APInt, 16> EltBits; | |||
| 53321 | // Extract constant bits and see if they are all | |||
| 53322 | // sign bit masks. Ignore the undef elements. | |||
| 53323 | if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits, | |||
| 53324 | /* AllowWholeUndefs */ true, | |||
| 53325 | /* AllowPartialUndefs */ false)) { | |||
| 53326 | for (unsigned I = 0, E = EltBits.size(); I < E; I++) | |||
| 53327 | if (!UndefElts[I] && !EltBits[I].isSignMask()) | |||
| 53328 | return SDValue(); | |||
| 53329 | ||||
| 53330 | // Only allow a bitcast from a correctly-sized constant. | |||
| 53331 | Op0 = peekThroughBitcasts(Op0); | |||
| 53332 | if (Op0.getScalarValueSizeInBits() == ScalarSize) | |||
| 53333 | return Op0; | |||
| 53334 | } | |||
| 53335 | break; | |||
| 53336 | } // case | |||
| 53337 | } // switch | |||
| 53338 | ||||
| 53339 | return SDValue(); | |||
| 53340 | } | |||
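| | ||||
| | // Illustrative examples of the forms recognized above (sketch, not part of | |||
| | // the original source): | |||
| | //   (fsub -0.0, x)                         --> x | |||
| | //   (xor (bitcast x), <0x80000000 splat>)  --> x (looking through the | |||
| | //                                              bitcasts, as in the AVX512F | |||
| | //                                              lowering described above) | |||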
| 53341 | ||||
| 53342 | static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc, | |||
| 53343 | bool NegRes) { | |||
| 53344 | if (NegMul) { | |||
| 53345 | switch (Opcode) { | |||
| 53346 | default: llvm_unreachable("Unexpected opcode"); | |||
| 53347 | case ISD::FMA: Opcode = X86ISD::FNMADD; break; | |||
| 53348 | case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break; | |||
| 53349 | case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break; | |||
| 53350 | case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break; | |||
| 53351 | case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break; | |||
| 53352 | case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break; | |||
| 53353 | case X86ISD::FNMADD: Opcode = ISD::FMA; break; | |||
| 53354 | case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break; | |||
| 53355 | case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break; | |||
| 53356 | case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break; | |||
| 53357 | case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break; | |||
| 53358 | case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break; | |||
| 53359 | } | |||
| 53360 | } | |||
| 53361 | ||||
| 53362 | if (NegAcc) { | |||
| 53363 | switch (Opcode) { | |||
| 53364 | default: llvm_unreachable("Unexpected opcode"); | |||
| 53365 | case ISD::FMA: Opcode = X86ISD::FMSUB; break; | |||
| 53366 | case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break; | |||
| 53367 | case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break; | |||
| 53368 | case X86ISD::FMSUB: Opcode = ISD::FMA; break; | |||
| 53369 | case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break; | |||
| 53370 | case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break; | |||
| 53371 | case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break; | |||
| 53372 | case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break; | |||
| 53373 | case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; | |||
| 53374 | case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break; | |||
| 53375 | case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break; | |||
| 53376 | case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; | |||
| 53377 | case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break; | |||
| 53378 | case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break; | |||
| 53379 | case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break; | |||
| 53380 | case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break; | |||
| 53381 | } | |||
| 53382 | } | |||
| 53383 | ||||
| 53384 | if (NegRes) { | |||
| 53385 | switch (Opcode) { | |||
| 53386 | // For accuracy reasons, we never combine fneg and fma under strict FP. | |||
| 53387 | default: llvm_unreachable("Unexpected opcode"); | |||
| 53388 | case ISD::FMA: Opcode = X86ISD::FNMSUB; break; | |||
| 53389 | case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break; | |||
| 53390 | case X86ISD::FMSUB: Opcode = X86ISD::FNMADD; break; | |||
| 53391 | case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMADD_RND; break; | |||
| 53392 | case X86ISD::FNMADD: Opcode = X86ISD::FMSUB; break; | |||
| 53393 | case X86ISD::FNMADD_RND: Opcode = X86ISD::FMSUB_RND; break; | |||
| 53394 | case X86ISD::FNMSUB: Opcode = ISD::FMA; break; | |||
| 53395 | case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMADD_RND; break; | |||
| 53396 | } | |||
| 53397 | } | |||
| 53398 | ||||
| 53399 | return Opcode; | |||
| 53400 | } | |||
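| | ||||
| | // Worked examples (sketch, not part of the original source), reading the | |||
| | // switch tables above: | |||
| | //   negateFMAOpcode(ISD::FMA, /*NegMul=*/true,  /*NegAcc=*/false, false) | |||
| | //     == X86ISD::FNMADD  // -(a*b) + c | |||
| | //   negateFMAOpcode(ISD::FMA, /*NegMul=*/false, /*NegAcc=*/true,  false) | |||
| | //     == X86ISD::FMSUB   // (a*b) - c | |||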
| 53401 | ||||
| 53402 | /// Do target-specific dag combines on floating point negations. | |||
| 53403 | static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, | |||
| 53404 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 53405 | const X86Subtarget &Subtarget) { | |||
| 53406 | EVT OrigVT = N->getValueType(0); | |||
| 53407 | SDValue Arg = isFNEG(DAG, N); | |||
| 53408 | if (!Arg) | |||
| 53409 | return SDValue(); | |||
| 53410 | ||||
| 53411 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 53412 | EVT VT = Arg.getValueType(); | |||
| 53413 | EVT SVT = VT.getScalarType(); | |||
| 53414 | SDLoc DL(N); | |||
| 53415 | ||||
| 53416 | // Let legalize expand this if it isn't a legal type yet. | |||
| 53417 | if (!TLI.isTypeLegal(VT)) | |||
| 53418 | return SDValue(); | |||
| 53419 | ||||
| 53420 | // If we're negating an FMUL node on a target with FMA, then we can avoid the | |||
| 53421 | // use of a constant by performing (-0 - A*B) instead. | |||
| 53422 | // FIXME: Check rounding control flags as well once it becomes available. | |||
| 53423 | if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) && | |||
| 53424 | Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) { | |||
| 53425 | SDValue Zero = DAG.getConstantFP(0.0, DL, VT); | |||
| 53426 | SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0), | |||
| 53427 | Arg.getOperand(1), Zero); | |||
| 53428 | return DAG.getBitcast(OrigVT, NewNode); | |||
| 53429 | } | |||
| 53430 | ||||
| 53431 | bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); | |||
| 53432 | bool LegalOperations = !DCI.isBeforeLegalizeOps(); | |||
| 53433 | if (SDValue NegArg = | |||
| 53434 | TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize)) | |||
| 53435 | return DAG.getBitcast(OrigVT, NegArg); | |||
| 53436 | ||||
| 53437 | return SDValue(); | |||
| 53438 | } | |||
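| | ||||
| | // Example of the FMA-based fold above (sketch, not part of the original | |||
| | // source): with FMA available and no-signed-zeros set, | |||
| | //   (fneg (fmul a, b)) --> (X86ISD::FNMSUB a, b, 0.0)  // -(a*b) - 0.0 | |||
| | // which avoids materializing a sign-mask constant. | |||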
| 53439 | ||||
| 53440 | SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, | |||
| 53441 | bool LegalOperations, | |||
| 53442 | bool ForCodeSize, | |||
| 53443 | NegatibleCost &Cost, | |||
| 53444 | unsigned Depth) const { | |||
| 53445 | // fneg patterns are removable even if they have multiple uses. | |||
| 53446 | if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) { | |||
| 53447 | Cost = NegatibleCost::Cheaper; | |||
| 53448 | return DAG.getBitcast(Op.getValueType(), Arg); | |||
| 53449 | } | |||
| 53450 | ||||
| 53451 | EVT VT = Op.getValueType(); | |||
| 53452 | EVT SVT = VT.getScalarType(); | |||
| 53453 | unsigned Opc = Op.getOpcode(); | |||
| 53454 | SDNodeFlags Flags = Op.getNode()->getFlags(); | |||
| 53455 | switch (Opc) { | |||
| 53456 | case ISD::FMA: | |||
| 53457 | case X86ISD::FMSUB: | |||
| 53458 | case X86ISD::FNMADD: | |||
| 53459 | case X86ISD::FNMSUB: | |||
| 53460 | case X86ISD::FMADD_RND: | |||
| 53461 | case X86ISD::FMSUB_RND: | |||
| 53462 | case X86ISD::FNMADD_RND: | |||
| 53463 | case X86ISD::FNMSUB_RND: { | |||
| 53464 | if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) || | |||
| 53465 | !(SVT == MVT::f32 || SVT == MVT::f64) || | |||
| 53466 | !isOperationLegal(ISD::FMA, VT)) | |||
| 53467 | break; | |||
| 53468 | ||||
| 53469 | // Don't fold (fneg (fma (fneg x), y, (fneg z))) to (fma x, y, z) | |||
| 53470 | // if it may have signed zeros. | |||
| 53471 | if (!Flags.hasNoSignedZeros()) | |||
| 53472 | break; | |||
| 53473 | ||||
| 53474 | // This is always negatible for free but we might be able to remove some | |||
| 53475 | // extra operand negations as well. | |||
| 53476 | SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue()); | |||
| 53477 | for (int i = 0; i != 3; ++i) | |||
| 53478 | NewOps[i] = getCheaperNegatedExpression( | |||
| 53479 | Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1); | |||
| 53480 | ||||
| 53481 | bool NegA = !!NewOps[0]; | |||
| 53482 | bool NegB = !!NewOps[1]; | |||
| 53483 | bool NegC = !!NewOps[2]; | |||
| 53484 | unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true); | |||
| 53485 | ||||
| 53486 | Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper | |||
| 53487 | : NegatibleCost::Neutral; | |||
| 53488 | ||||
| 53489 | // Fill in the non-negated ops with the original values. | |||
| 53490 | for (int i = 0, e = Op.getNumOperands(); i != e; ++i) | |||
| 53491 | if (!NewOps[i]) | |||
| 53492 | NewOps[i] = Op.getOperand(i); | |||
| 53493 | return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps); | |||
| 53494 | } | |||
| 53495 | case X86ISD::FRCP: | |||
| 53496 | if (SDValue NegOp0 = | |||
| 53497 | getNegatedExpression(Op.getOperand(0), DAG, LegalOperations, | |||
| 53498 | ForCodeSize, Cost, Depth + 1)) | |||
| 53499 | return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0); | |||
| 53500 | break; | |||
| 53501 | } | |||
| 53502 | ||||
| 53503 | return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations, | |||
| 53504 | ForCodeSize, Cost, Depth); | |||
| 53505 | } | |||
| 53506 | ||||
| 53507 | static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, | |||
| 53508 | const X86Subtarget &Subtarget) { | |||
| 53509 | MVT VT = N->getSimpleValueType(0); | |||
| 53510 | // If we have integer vector types available, use the integer opcodes. | |||
| 53511 | if (!VT.isVector() || !Subtarget.hasSSE2()) | |||
| 53512 | return SDValue(); | |||
| 53513 | ||||
| 53514 | SDLoc dl(N); | |||
| 53515 | ||||
| 53516 | unsigned IntBits = VT.getScalarSizeInBits(); | |||
| 53517 | MVT IntSVT = MVT::getIntegerVT(IntBits); | |||
| 53518 | MVT IntVT = MVT::getVectorVT(IntSVT, VT.getSizeInBits() / IntBits); | |||
| 53519 | ||||
| 53520 | SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0)); | |||
| 53521 | SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1)); | |||
| 53522 | unsigned IntOpcode; | |||
| 53523 | switch (N->getOpcode()) { | |||
| 53524 | default: llvm_unreachable("Unexpected FP logic op"); | |||
| 53525 | case X86ISD::FOR: IntOpcode = ISD::OR; break; | |||
| 53526 | case X86ISD::FXOR: IntOpcode = ISD::XOR; break; | |||
| 53527 | case X86ISD::FAND: IntOpcode = ISD::AND; break; | |||
| 53528 | case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break; | |||
| 53529 | } | |||
| 53530 | SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1); | |||
| 53531 | return DAG.getBitcast(VT, IntOp); | |||
| 53532 | } | |||
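| | ||||
| | // Example (sketch, not part of the original source): on SSE2, | |||
| | //   (X86ISD::FAND v4f32:a, v4f32:b) | |||
| | //     --> (bitcast (and (bitcast v4i32 a), (bitcast v4i32 b))) | |||
| | // so the operation can be selected in the integer domain. | |||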
| 53533 | ||||
| 53534 | ||||
| 53535 | /// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val) | |||
| 53536 | static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) { | |||
| 53537 | if (N->getOpcode() != ISD::XOR) | |||
| 53538 | return SDValue(); | |||
| 53539 | ||||
| 53540 | SDValue LHS = N->getOperand(0); | |||
| 53541 | if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC) | |||
| 53542 | return SDValue(); | |||
| 53543 | ||||
| 53544 | X86::CondCode NewCC = X86::GetOppositeBranchCondition( | |||
| 53545 | X86::CondCode(LHS->getConstantOperandVal(0))); | |||
| 53546 | SDLoc DL(N); | |||
| 53547 | return getSETCC(NewCC, LHS->getOperand(1), DL, DAG); | |||
| 53548 | } | |||
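| | ||||
| | // Example (sketch, not part of the original source): | |||
| | //   (xor (X86ISD::SETCC COND_E, EFLAGS), 1) | |||
| | //     --> (X86ISD::SETCC COND_NE, EFLAGS) | |||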
| 53549 | ||||
| 53550 | static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG, | |||
| 53551 | const X86Subtarget &Subtarget) { | |||
| 53552 | assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) && | |||
| 53553 | "Invalid opcode for combining with CTLZ"); | |||
| 53554 | if (Subtarget.hasFastLZCNT()) | |||
| 53555 | return SDValue(); | |||
| 53556 | ||||
| 53557 | EVT VT = N->getValueType(0); | |||
| 53558 | if (VT != MVT::i8 && VT != MVT::i16 && VT != MVT::i32 && | |||
| 53559 | (VT != MVT::i64 || !Subtarget.is64Bit())) | |||
| 53560 | return SDValue(); | |||
| 53561 | ||||
| 53562 | SDValue N0 = N->getOperand(0); | |||
| 53563 | SDValue N1 = N->getOperand(1); | |||
| 53564 | ||||
| 53565 | if (N0.getOpcode() != ISD::CTLZ_ZERO_UNDEF && | |||
| 53566 | N1.getOpcode() != ISD::CTLZ_ZERO_UNDEF) | |||
| 53567 | return SDValue(); | |||
| 53568 | ||||
| 53569 | SDValue OpCTLZ; | |||
| 53570 | SDValue OpSizeTM1; | |||
| 53571 | ||||
| 53572 | if (N1.getOpcode() == ISD::CTLZ_ZERO_UNDEF) { | |||
| 53573 | OpCTLZ = N1; | |||
| 53574 | OpSizeTM1 = N0; | |||
| 53575 | } else if (N->getOpcode() == ISD::SUB) { | |||
| 53576 | return SDValue(); | |||
| 53577 | } else { | |||
| 53578 | OpCTLZ = N0; | |||
| 53579 | OpSizeTM1 = N1; | |||
| 53580 | } | |||
| 53581 | ||||
| 53582 | if (!OpCTLZ.hasOneUse()) | |||
| 53583 | return SDValue(); | |||
| 53584 | auto *C = dyn_cast<ConstantSDNode>(OpSizeTM1); | |||
| 53585 | if (!C) | |||
| 53586 | return SDValue(); | |||
| 53587 | ||||
| 53588 | if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1)) | |||
| 53589 | return SDValue(); | |||
| 53590 | SDLoc DL(N); | |||
| 53591 | EVT OpVT = VT; | |||
| 53592 | SDValue Op = OpCTLZ.getOperand(0); | |||
| 53593 | if (VT == MVT::i8) { | |||
| 53594 | // Zero extend to i32 since there is no i8 BSR. | |||
| 53595 | OpVT = MVT::i32; | |||
| 53596 | Op = DAG.getNode(ISD::ZERO_EXTEND, DL, OpVT, Op); | |||
| 53597 | } | |||
| 53598 | ||||
| 53599 | SDVTList VTs = DAG.getVTList(OpVT, MVT::i32); | |||
| 53600 | Op = DAG.getNode(X86ISD::BSR, DL, VTs, Op); | |||
| 53601 | if (VT == MVT::i8) | |||
| 53602 | Op = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Op); | |||
| 53603 | ||||
| 53604 | return Op; | |||
| 53605 | } | |||
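| | ||||
| | // Example (sketch, not part of the original source): for i32 without fast | |||
| | // LZCNT, | |||
| | //   (xor (ctlz_zero_undef x), 31) --> (X86ISD::BSR x) | |||
| | // since xoring a 5-bit count with 31 equals 31 - ctlz(x), which is exactly | |||
| | // what BSR computes. | |||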
| 53606 | ||||
| 53607 | static SDValue combineXor(SDNode *N, SelectionDAG &DAG, | |||
| 53608 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 53609 | const X86Subtarget &Subtarget) { | |||
| 53610 | SDValue N0 = N->getOperand(0); | |||
| 53611 | SDValue N1 = N->getOperand(1); | |||
| 53612 | EVT VT = N->getValueType(0); | |||
| 53613 | ||||
| 53614 | // If this is SSE1 only, convert to FXOR to avoid scalarization. | |||
| 53615 | if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { | |||
| 53616 | return DAG.getBitcast(MVT::v4i32, | |||
| 53617 | DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32, | |||
| 53618 | DAG.getBitcast(MVT::v4f32, N0), | |||
| 53619 | DAG.getBitcast(MVT::v4f32, N1))); | |||
| 53620 | } | |||
| 53621 | ||||
| 53622 | if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) | |||
| 53623 | return Cmp; | |||
| 53624 | ||||
| 53625 | if (SDValue R = combineBitOpWithMOVMSK(N, DAG)) | |||
| 53626 | return R; | |||
| 53627 | ||||
| 53628 | if (SDValue R = combineBitOpWithShift(N, DAG)) | |||
| 53629 | return R; | |||
| 53630 | ||||
| 53631 | if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget)) | |||
| 53632 | return FPLogic; | |||
| 53633 | ||||
| 53634 | if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget)) | |||
| 53635 | return R; | |||
| 53636 | ||||
| 53637 | if (DCI.isBeforeLegalizeOps()) | |||
| 53638 | return SDValue(); | |||
| 53639 | ||||
| 53640 | if (SDValue SetCC = foldXor1SetCC(N, DAG)) | |||
| 53641 | return SetCC; | |||
| 53642 | ||||
| 53643 | if (SDValue R = combineOrXorWithSETCC(N, N0, N1, DAG)) | |||
| 53644 | return R; | |||
| 53645 | ||||
| 53646 | if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) | |||
| 53647 | return RV; | |||
| 53648 | ||||
| 53649 | // Fold not(iX bitcast(vXi1)) -> (iX bitcast(not(vec))) for legal boolvecs. | |||
| 53650 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 53651 | if (llvm::isAllOnesConstant(N1) && N0.getOpcode() == ISD::BITCAST && | |||
| 53652 | N0.getOperand(0).getValueType().isVector() && | |||
| 53653 | N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 && | |||
| 53654 | TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) { | |||
| 53655 | return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0), | |||
| 53656 | N0.getOperand(0).getValueType())); | |||
| 53657 | } | |||
| 53658 | ||||
| 53659 | // Handle AVX512 mask widening. | |||
| 53660 | // Fold not(insert_subvector(undef,sub)) -> insert_subvector(undef,not(sub)) | |||
| 53661 | if (ISD::isBuildVectorAllOnes(N1.getNode()) && VT.isVector() && | |||
| 53662 | VT.getVectorElementType() == MVT::i1 && | |||
| 53663 | N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() && | |||
| 53664 | TLI.isTypeLegal(N0.getOperand(1).getValueType())) { | |||
| 53665 | return DAG.getNode( | |||
| 53666 | ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0), | |||
| 53667 | DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()), | |||
| 53668 | N0.getOperand(2)); | |||
| 53669 | } | |||
| 53670 | ||||
| 53671 | // Fold xor(zext(xor(x,c1)),c2) -> xor(zext(x),xor(zext(c1),c2)) | |||
| 53672 | // Fold xor(truncate(xor(x,c1)),c2) -> xor(truncate(x),xor(truncate(c1),c2)) | |||
| 53673 | // TODO: Under what circumstances could this be performed in DAGCombine? | |||
| 53674 | if ((N0.getOpcode() == ISD::TRUNCATE || N0.getOpcode() == ISD::ZERO_EXTEND) && | |||
| 53675 | N0.getOperand(0).getOpcode() == N->getOpcode()) { | |||
| 53676 | SDValue TruncExtSrc = N0.getOperand(0); | |||
| 53677 | auto *N1C = dyn_cast<ConstantSDNode>(N1); | |||
| 53678 | auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1)); | |||
| 53679 | if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) { | |||
| 53680 | SDLoc DL(N); | |||
| 53681 | SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT); | |||
| 53682 | SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT); | |||
| 53683 | return DAG.getNode(ISD::XOR, DL, VT, LHS, | |||
| 53684 | DAG.getNode(ISD::XOR, DL, VT, RHS, N1)); | |||
| 53685 | } | |||
| 53686 | } | |||
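| | ||||
| | // Example of the constant-merging fold above (sketch, not part of the | |||
| | // original source): | |||
| | //   (xor (i16 zext (xor i8 x, 0x80)), 0xFF) | |||
| | //     --> (xor (i16 zext x), 0x7F)   // 0x7F == zext(0x80) ^ 0xFF | |||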
| 53687 | ||||
| 53688 | if (SDValue R = combineBMILogicOp(N, DAG, Subtarget)) | |||
| 53689 | return R; | |||
| 53690 | ||||
| 53691 | return combineFneg(N, DAG, DCI, Subtarget); | |||
| 53692 | } | |||
| 53693 | ||||
| 53694 | static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG, | |||
| 53695 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 53696 | const X86Subtarget &Subtarget) { | |||
| 53697 | EVT VT = N->getValueType(0); | |||
| 53698 | unsigned NumBits = VT.getSizeInBits(); | |||
| 53699 | ||||
| 53700 | // TODO - Constant Folding. | |||
| 53701 | ||||
| 53702 | // Simplify the inputs. | |||
| 53703 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 53704 | APInt DemandedMask(APInt::getAllOnes(NumBits)); | |||
| 53705 | if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI)) | |||
| 53706 | return SDValue(N, 0); | |||
| 53707 | ||||
| 53708 | return SDValue(); | |||
| 53709 | } | |||
| 53710 | ||||
| 53711 | static bool isNullFPScalarOrVectorConst(SDValue V) { | |||
| 53712 | return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode()); | |||
| 53713 | } | |||
| 53714 | ||||
| 53715 | /// If a value is a scalar FP zero or a vector FP zero (potentially including | |||
| 53716 | /// undefined elements), return a zero constant that may be used to fold away | |||
| 53717 | /// that value. In the case of a vector, the returned constant will not contain | |||
| 53718 | /// undefined elements even if the input parameter does. This makes it suitable | |||
| 53719 | /// to be used as a replacement operand with operations (e.g., bitwise-and) where | |||
| 53720 | /// an undef should not propagate. | |||
| 53721 | static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG, | |||
| 53722 | const X86Subtarget &Subtarget) { | |||
| 53723 | if (!isNullFPScalarOrVectorConst(V)) | |||
| 53724 | return SDValue(); | |||
| 53725 | ||||
| 53726 | if (V.getValueType().isVector()) | |||
| 53727 | return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V)); | |||
| 53728 | ||||
| 53729 | return V; | |||
| 53730 | } | |||
| 53731 | ||||
| 53732 | static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, | |||
| 53733 | const X86Subtarget &Subtarget) { | |||
| 53734 | SDValue N0 = N->getOperand(0); | |||
| 53735 | SDValue N1 = N->getOperand(1); | |||
| 53736 | EVT VT = N->getValueType(0); | |||
| 53737 | SDLoc DL(N); | |||
| 53738 | ||||
| 53739 | // Vector types are handled in combineANDXORWithAllOnesIntoANDNP(). | |||
| 53740 | if (!((VT == MVT::f32 && Subtarget.hasSSE1()) || | |||
| 53741 | (VT == MVT::f64 && Subtarget.hasSSE2()) || | |||
| 53742 | (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2()))) | |||
| 53743 | return SDValue(); | |||
| 53744 | ||||
| 53745 | auto isAllOnesConstantFP = [](SDValue V) { | |||
| 53746 | if (V.getSimpleValueType().isVector()) | |||
| 53747 | return ISD::isBuildVectorAllOnes(V.getNode()); | |||
| 53748 | auto *C = dyn_cast<ConstantFPSDNode>(V); | |||
| 53749 | return C && C->getConstantFPValue()->isAllOnesValue(); | |||
| 53750 | }; | |||
| 53751 | ||||
| 53752 | // fand (fxor X, -1), Y --> fandn X, Y | |||
| 53753 | if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1))) | |||
| 53754 | return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1); | |||
| 53755 | ||||
| 53756 | // fand X, (fxor Y, -1) --> fandn Y, X | |||
| 53757 | if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1))) | |||
| 53758 | return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0); | |||
| 53759 | ||||
| 53760 | return SDValue(); | |||
| 53761 | } | |||
| 53762 | ||||
| 53763 | /// Do target-specific dag combines on X86ISD::FAND nodes. | |||
| 53764 | static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG, | |||
| 53765 | const X86Subtarget &Subtarget) { | |||
| 53766 | // FAND(0.0, x) -> 0.0 | |||
| 53767 | if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget)) | |||
| 53768 | return V; | |||
| 53769 | ||||
| 53770 | // FAND(x, 0.0) -> 0.0 | |||
| 53771 | if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget)) | |||
| 53772 | return V; | |||
| 53773 | ||||
| 53774 | if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget)) | |||
| 53775 | return V; | |||
| 53776 | ||||
| 53777 | return lowerX86FPLogicOp(N, DAG, Subtarget); | |||
| 53778 | } | |||
| 53779 | ||||
| 53780 | /// Do target-specific dag combines on X86ISD::FANDN nodes. | |||
| 53781 | static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG, | |||
| 53782 | const X86Subtarget &Subtarget) { | |||
| 53783 | // FANDN(0.0, x) -> x | |||
| 53784 | if (isNullFPScalarOrVectorConst(N->getOperand(0))) | |||
| 53785 | return N->getOperand(1); | |||
| 53786 | ||||
| 53787 | // FANDN(x, 0.0) -> 0.0 | |||
| 53788 | if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget)) | |||
| 53789 | return V; | |||
| 53790 | ||||
| 53791 | return lowerX86FPLogicOp(N, DAG, Subtarget); | |||
| 53792 | } | |||
| 53793 | ||||
| 53794 | /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes. | |||
| 53795 | static SDValue combineFOr(SDNode *N, SelectionDAG &DAG, | |||
| 53796 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 53797 | const X86Subtarget &Subtarget) { | |||
| 53798 | assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR); | |||
| 53799 | ||||
| 53800 | // F[X]OR(0.0, x) -> x | |||
| 53801 | if (isNullFPScalarOrVectorConst(N->getOperand(0))) | |||
| 53802 | return N->getOperand(1); | |||
| 53803 | ||||
| 53804 | // F[X]OR(x, 0.0) -> x | |||
| 53805 | if (isNullFPScalarOrVectorConst(N->getOperand(1))) | |||
| 53806 | return N->getOperand(0); | |||
| 53807 | ||||
| 53808 | if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget)) | |||
| 53809 | return NewVal; | |||
| 53810 | ||||
| 53811 | return lowerX86FPLogicOp(N, DAG, Subtarget); | |||
| 53812 | } | |||
| 53813 | ||||
| 53814 | /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes. | |||
| 53815 | static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) { | |||
| 53816 | assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX); | |||
| 53817 | ||||
| 53818 | // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed. | |||
| 53819 | if (!DAG.getTarget().Options.NoNaNsFPMath || | |||
| 53820 | !DAG.getTarget().Options.NoSignedZerosFPMath) | |||
| 53821 | return SDValue(); | |||
| 53822 | ||||
| 53823 | // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes | |||
| 53824 | // into FMAXC and FMINC, which are commutative operations. | |||
| 53825 | unsigned NewOp = 0; | |||
| 53826 | switch (N->getOpcode()) { | |||
| 53827 | default: llvm_unreachable("unknown opcode"); | |||
| 53828 | case X86ISD::FMIN: NewOp = X86ISD::FMINC; break; | |||
| 53829 | case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break; | |||
| 53830 | } | |||
| 53831 | ||||
| 53832 | return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0), | |||
| 53833 | N->getOperand(0), N->getOperand(1)); | |||
| 53834 | } | |||
| 53835 | ||||
| 53836 | static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, | |||
| 53837 | const X86Subtarget &Subtarget) { | |||
| 53838 | EVT VT = N->getValueType(0); | |||
| 53839 | if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget)) | |||
| 53840 | return SDValue(); | |||
| 53841 | ||||
| 53842 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 53843 | ||||
| 53844 | if (!((Subtarget.hasSSE1() && VT == MVT::f32) || | |||
| 53845 | (Subtarget.hasSSE2() && VT == MVT::f64) || | |||
| 53846 | (Subtarget.hasFP16() && VT == MVT::f16) || | |||
| 53847 | (VT.isVector() && TLI.isTypeLegal(VT)))) | |||
| 53848 | return SDValue(); | |||
| 53849 | ||||
| 53850 | SDValue Op0 = N->getOperand(0); | |||
| 53851 | SDValue Op1 = N->getOperand(1); | |||
| 53852 | SDLoc DL(N); | |||
| 53853 | auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN; | |||
| 53854 | ||||
| 53855 | // If we don't have to respect NaN inputs, this is a direct translation to x86 | |||
| 53856 | // min/max instructions. | |||
| 53857 | if (DAG.getTarget().Options.NoNaNsFPMath || N->getFlags().hasNoNaNs()) | |||
| 53858 | return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags()); | |||
| 53859 | ||||
| 53860 | // If one of the operands is known non-NaN, use the native min/max | |||
| 53861 | // instructions with the non-NaN input as the second operand. | |||
| 53862 | if (DAG.isKnownNeverNaN(Op1)) | |||
| 53863 | return DAG.getNode(MinMaxOp, DL, VT, Op0, Op1, N->getFlags()); | |||
| 53864 | if (DAG.isKnownNeverNaN(Op0)) | |||
| 53865 | return DAG.getNode(MinMaxOp, DL, VT, Op1, Op0, N->getFlags()); | |||
| 53866 | ||||
| 53867 | // If we have to respect NaN inputs, this takes at least 3 instructions. | |||
| 53868 | // Favor a library call when operating on a scalar and minimizing code size. | |||
| 53869 | if (!VT.isVector() && DAG.getMachineFunction().getFunction().hasMinSize()) | |||
| 53870 | return SDValue(); | |||
| 53871 | ||||
| 53872 | EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), | |||
| 53873 | VT); | |||
| 53874 | ||||
| 53875 | // There are 4 possibilities involving NaN inputs, and these are the required | |||
| 53876 | // outputs: | |||
| 53877 | // Op1 | |||
| 53878 | // Num NaN | |||
| 53879 | // ---------------- | |||
| 53880 | // Num | Max | Op0 | | |||
| 53881 | // Op0 ---------------- | |||
| 53882 | // NaN | Op1 | NaN | | |||
| 53883 | // ---------------- | |||
| 53884 | // | |||
| 53885 | // The SSE FP max/min instructions were not designed for this case, but rather | |||
| 53886 | // to implement: | |||
| 53887 | // Min = Op1 < Op0 ? Op1 : Op0 | |||
| 53888 | // Max = Op1 > Op0 ? Op1 : Op0 | |||
| 53889 | // | |||
| 53890 | // So they always return Op0 if either input is a NaN. However, we can still | |||
| 53891 | // use those instructions for fmaxnum by selecting away a NaN input. | |||
| 53892 | ||||
| 53893 | // If either operand is NaN, the 2nd source operand (Op0) is passed through. | |||
| 53894 | SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0); | |||
| 53895 | SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType, Op0, Op0, ISD::SETUO); | |||
| 53896 | ||||
| 53897 | // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands | |||
| 53898 | // are NaN, the NaN value of Op1 is the result. | |||
| 53899 | return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax); | |||
| 53900 | } | |||
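| | ||||
| | // Worked instance of the table above (sketch, not part of the original | |||
| | // source), for fmaxnum(x, y) where either input may be NaN: | |||
| | //   MinOrMax = (X86ISD::FMAX y, x)   // passes x through if either is NaN | |||
| | //   IsOp0Nan = (setcc x, x, setuo)   // true iff x is NaN | |||
| | //   result   = (select IsOp0Nan, y, MinOrMax) | |||
| | // so a NaN x selects y, as fmaxnum requires. | |||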
| 53901 | ||||
| 53902 | static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG, | |||
| 53903 | TargetLowering::DAGCombinerInfo &DCI) { | |||
| 53904 | EVT VT = N->getValueType(0); | |||
| 53905 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 53906 | ||||
| 53907 | APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); | |||
| 53908 | if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI)) | |||
| 53909 | return SDValue(N, 0); | |||
| 53910 | ||||
| 53911 | // Convert a full vector load into vzload when not all bits are needed. | |||
| 53912 | SDValue In = N->getOperand(0); | |||
| 53913 | MVT InVT = In.getSimpleValueType(); | |||
| 53914 | if (VT.getVectorNumElements() < InVT.getVectorNumElements() && | |||
| 53915 | ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { | |||
| 53916 | assert(InVT.is128BitVector() && "Expected 128-bit input vector"); | |||
| 53917 | LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0)); | |||
| 53918 | unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); | |||
| 53919 | MVT MemVT = MVT::getIntegerVT(NumBits); | |||
| 53920 | MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); | |||
| 53921 | if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) { | |||
| 53922 | SDLoc dl(N); | |||
| 53923 | SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT, | |||
| 53924 | DAG.getBitcast(InVT, VZLoad)); | |||
| 53925 | DCI.CombineTo(N, Convert); | |||
| 53926 | DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); | |||
| 53927 | DCI.recursivelyDeleteUnusedNodes(LN); | |||
| 53928 | return SDValue(N, 0); | |||
| 53929 | } | |||
| 53930 | } | |||
| 53931 | ||||
| 53932 | return SDValue(); | |||
| 53933 | } | |||
| 53934 | ||||
| 53935 | static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG, | |||
| 53936 | TargetLowering::DAGCombinerInfo &DCI) { | |||
| 53937 | bool IsStrict = N->isTargetStrictFPOpcode(); | |||
| 53938 | EVT VT = N->getValueType(0); | |||
| 53939 | ||||
| 53940 | // Convert a full vector load into vzload when not all bits are needed. | |||
| 53941 | SDValue In = N->getOperand(IsStrict ? 1 : 0); | |||
| 53942 | MVT InVT = In.getSimpleValueType(); | |||
| 53943 | if (VT.getVectorNumElements() < InVT.getVectorNumElements() && | |||
| 53944 | ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) { | |||
| 53945 | assert(InVT.is128BitVector() && "Expected 128-bit input vector"); | |||
| 53946 | LoadSDNode *LN = cast<LoadSDNode>(In); | |||
| 53947 | unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements(); | |||
| 53948 | MVT MemVT = MVT::getFloatingPointVT(NumBits); | |||
| 53949 | MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits); | |||
| 53950 | if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) { | |||
| 53951 | SDLoc dl(N); | |||
| 53952 | if (IsStrict) { | |||
| 53953 | SDValue Convert = | |||
| 53954 | DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other}, | |||
| 53955 | {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)}); | |||
| 53956 | DCI.CombineTo(N, Convert, Convert.getValue(1)); | |||
| 53957 | } else { | |||
| 53958 | SDValue Convert = | |||
| 53959 | DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad)); | |||
| 53960 | DCI.CombineTo(N, Convert); | |||
| 53961 | } | |||
| 53962 | DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); | |||
| 53963 | DCI.recursivelyDeleteUnusedNodes(LN); | |||
| 53964 | return SDValue(N, 0); | |||
| 53965 | } | |||
| 53966 | } | |||
| 53967 | ||||
| 53968 | return SDValue(); | |||
| 53969 | } | |||
| 53970 | ||||
| 53971 | /// Do target-specific dag combines on X86ISD::ANDNP nodes. | |||
| 53972 | static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, | |||
| 53973 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 53974 | const X86Subtarget &Subtarget) { | |||
| 53975 | SDValue N0 = N->getOperand(0); | |||
| 53976 | SDValue N1 = N->getOperand(1); | |||
| 53977 | MVT VT = N->getSimpleValueType(0); | |||
| 53978 | int NumElts = VT.getVectorNumElements(); | |||
| 53979 | unsigned EltSizeInBits = VT.getScalarSizeInBits(); | |||
| 53980 | ||||
| 53981 | // ANDNP(undef, x) -> 0 | |||
| 53982 | // ANDNP(x, undef) -> 0 | |||
| 53983 | if (N0.isUndef() || N1.isUndef()) | |||
| 53984 | return DAG.getConstant(0, SDLoc(N), VT); | |||
| 53985 | ||||
| 53986 | // ANDNP(0, x) -> x | |||
| 53987 | if (ISD::isBuildVectorAllZeros(N0.getNode())) | |||
| 53988 | return N1; | |||
| 53989 | ||||
| 53990 | // ANDNP(x, 0) -> 0 | |||
| 53991 | if (ISD::isBuildVectorAllZeros(N1.getNode())) | |||
| 53992 | return DAG.getConstant(0, SDLoc(N), VT); | |||
| 53993 | ||||
| 53994 | // Turn ANDNP back to AND if input is inverted. | |||
| 53995 | if (SDValue Not = IsNOT(N0, DAG)) | |||
| 53996 | return DAG.getNode(ISD::AND, SDLoc(N), VT, DAG.getBitcast(VT, Not), N1); | |||
| 53997 | ||||
| 53998 | // Constant Folding | |||
| 53999 | APInt Undefs0, Undefs1; | |||
| 54000 | SmallVector<APInt> EltBits0, EltBits1; | |||
| 54001 | if (getTargetConstantBitsFromNode(N0, EltSizeInBits, Undefs0, EltBits0)) { | |||
| 54002 | SDLoc DL(N); | |||
| 54003 | if (getTargetConstantBitsFromNode(N1, EltSizeInBits, Undefs1, EltBits1)) { | |||
| 54004 | SmallVector<APInt> ResultBits; | |||
| 54005 | for (int I = 0; I != NumElts; ++I) | |||
| 54006 | ResultBits.push_back(~EltBits0[I] & EltBits1[I]); | |||
| 54007 | return getConstVector(ResultBits, VT, DAG, DL); | |||
| 54008 | } | |||
| 54009 | ||||
| 54010 | // Constant fold NOT(N0) to allow us to use AND. | |||
| 54011 | // Ensure this is only performed if we can confirm that the bitcasted source | |||
| 54012 | // has one use, to prevent an infinite loop with canonicalizeBitSelect. | |||
| 54013 | if (N0->hasOneUse()) { | |||
| 54014 | SDValue BC0 = peekThroughOneUseBitcasts(N0); | |||
| 54015 | if (BC0.getOpcode() != ISD::BITCAST) { | |||
| 54016 | for (APInt &Elt : EltBits0) | |||
| 54017 | Elt = ~Elt; | |||
| 54018 | SDValue Not = getConstVector(EltBits0, VT, DAG, DL); | |||
| 54019 | return DAG.getNode(ISD::AND, DL, VT, Not, N1); | |||
| 54020 | } | |||
| 54021 | } | |||
| 54022 | } | |||
| 54023 | ||||
| 54024 | // Attempt to recursively combine a bitmask ANDNP with shuffles. | |||
| 54025 | if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { | |||
| 54026 | SDValue Op(N, 0); | |||
| 54027 | if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) | |||
| 54028 | return Res; | |||
| 54029 | ||||
| 54030 | // If either operand is a constant mask, then only the elements that aren't | |||
| 54031 | // zero are actually demanded by the other operand. | |||
| 54032 | auto GetDemandedMasks = [&](SDValue Op, bool Invert = false) { | |||
| 54033 | APInt UndefElts; | |||
| 54034 | SmallVector<APInt> EltBits; | |||
| 54035 | APInt DemandedBits = APInt::getAllOnes(EltSizeInBits); | |||
| 54036 | APInt DemandedElts = APInt::getAllOnes(NumElts); | |||
| 54037 | if (getTargetConstantBitsFromNode(Op, EltSizeInBits, UndefElts, | |||
| 54038 | EltBits)) { | |||
| 54039 | DemandedBits.clearAllBits(); | |||
| 54040 | DemandedElts.clearAllBits(); | |||
| 54041 | for (int I = 0; I != NumElts; ++I) { | |||
| 54042 | if (UndefElts[I]) { | |||
| 54043 | // We can't assume an undef src element gives an undef dst - the | |||
| 54044 | // other src might be zero. | |||
| 54045 | DemandedBits.setAllBits(); | |||
| 54046 | DemandedElts.setBit(I); | |||
| 54047 | } else if ((Invert && !EltBits[I].isAllOnes()) || | |||
| 54048 | (!Invert && !EltBits[I].isZero())) { | |||
| 54049 | DemandedBits |= Invert ? ~EltBits[I] : EltBits[I]; | |||
| 54050 | DemandedElts.setBit(I); | |||
| 54051 | } | |||
| 54052 | } | |||
| 54053 | } | |||
| 54054 | return std::make_pair(DemandedBits, DemandedElts); | |||
| 54055 | }; | |||
| 54056 | APInt Bits0, Elts0; | |||
| 54057 | APInt Bits1, Elts1; | |||
| 54058 | std::tie(Bits0, Elts0) = GetDemandedMasks(N1); | |||
| 54059 | std::tie(Bits1, Elts1) = GetDemandedMasks(N0, true); | |||
| 54060 | ||||
| 54061 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 54062 | if (TLI.SimplifyDemandedVectorElts(N0, Elts0, DCI) || | |||
| 54063 | TLI.SimplifyDemandedVectorElts(N1, Elts1, DCI) || | |||
| 54064 | TLI.SimplifyDemandedBits(N0, Bits0, Elts0, DCI) || | |||
| 54065 | TLI.SimplifyDemandedBits(N1, Bits1, Elts1, DCI)) { | |||
| 54066 | if (N->getOpcode() != ISD::DELETED_NODE) | |||
| 54067 | DCI.AddToWorklist(N); | |||
| 54068 | return SDValue(N, 0); | |||
| 54069 | } | |||
| 54070 | } | |||
| 54071 | ||||
| 54072 | return SDValue(); | |||
| 54073 | } | |||
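| | ||||
| | // Example of the constant NOT fold above (sketch, not part of the original | |||
| | // source): since ANDNP computes (~a & b), | |||
| | //   (X86ISD::ANDNP <0x0000FFFF splat>, x) --> (and <0xFFFF0000 splat>, x) | |||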
| 54074 | ||||
| 54075 | static SDValue combineBT(SDNode *N, SelectionDAG &DAG, | |||
| 54076 | TargetLowering::DAGCombinerInfo &DCI) { | |||
| 54077 | SDValue N1 = N->getOperand(1); | |||
| 54078 | ||||
| 54079 | // BT ignores high bits in the bit index operand. | |||
| 54080 | unsigned BitWidth = N1.getValueSizeInBits(); | |||
| 54081 | APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); | |||
| 54082 | if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) { | |||
| 54083 | if (N->getOpcode() != ISD::DELETED_NODE) | |||
| 54084 | DCI.AddToWorklist(N); | |||
| 54085 | return SDValue(N, 0); | |||
| 54086 | } | |||
| 54087 | ||||
| 54088 | return SDValue(); | |||
| 54089 | } | |||
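| | ||||
| | // Example (sketch, not part of the original source): only the low | |||
| | // Log2_32(64) == 6 bits of a 64-bit bit-index are demanded, so an explicit | |||
| | //   (X86ISD::BT x, (and idx, 63)) simplifies to (X86ISD::BT x, idx). | |||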
| 54090 | ||||
| 54091 | static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG, | |||
| 54092 | TargetLowering::DAGCombinerInfo &DCI) { | |||
| 54093 | bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS; | |||
| 54094 | SDValue Src = N->getOperand(IsStrict ? 1 : 0); | |||
| 54095 | ||||
| 54096 | if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) { | |||
| 54097 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 54098 | APInt DemandedElts = APInt::getLowBitsSet(8, 4); | |||
| 54099 | if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, DCI)) { | |||
| 54100 | if (N->getOpcode() != ISD::DELETED_NODE) | |||
| 54101 | DCI.AddToWorklist(N); | |||
| 54102 | return SDValue(N, 0); | |||
| 54103 | } | |||
| 54104 | ||||
| 54105 | // Convert a full vector load into vzload when not all bits are needed. | |||
| 54106 | if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { | |||
| 54107 | LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0)); | |||
| 54108 | if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) { | |||
| 54109 | SDLoc dl(N); | |||
| 54110 | if (IsStrict) { | |||
| 54111 | SDValue Convert = DAG.getNode( | |||
| 54112 | N->getOpcode(), dl, {MVT::v4f32, MVT::Other}, | |||
| 54113 | {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)}); | |||
| 54114 | DCI.CombineTo(N, Convert, Convert.getValue(1)); | |||
| 54115 | } else { | |||
| 54116 | SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32, | |||
| 54117 | DAG.getBitcast(MVT::v8i16, VZLoad)); | |||
| 54118 | DCI.CombineTo(N, Convert); | |||
| 54119 | } | |||
| 54120 | ||||
| 54121 | DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1)); | |||
| 54122 | DCI.recursivelyDeleteUnusedNodes(LN); | |||
| 54123 | return SDValue(N, 0); | |||
| 54124 | } | |||
| 54125 | } | |||
| 54126 | } | |||
| 54127 | ||||
| 54128 | return SDValue(); | |||
| 54129 | } | |||
| 54130 | ||||
| 54131 | // Try to combine sext_in_reg of a cmov of constants by extending the constants. | |||
| 54132 | static SDValue combineSextInRegCmov(SDNode *N, SelectionDAG &DAG) { | |||
| 54133 | assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG); | |||
| 54134 | ||||
| 54135 | EVT DstVT = N->getValueType(0); | |||
| 54136 | ||||
| 54137 | SDValue N0 = N->getOperand(0); | |||
| 54138 | SDValue N1 = N->getOperand(1); | |||
| 54139 | EVT ExtraVT = cast<VTSDNode>(N1)->getVT(); | |||
| 54140 | ||||
| 54141 | if (ExtraVT != MVT::i8 && ExtraVT != MVT::i16) | |||
| 54142 | return SDValue(); | |||
| 54143 | ||||
| 54144 | // Look through single use any_extends / truncs. | |||
| 54145 | SDValue IntermediateBitwidthOp; | |||
| 54146 | if ((N0.getOpcode() == ISD::ANY_EXTEND || N0.getOpcode() == ISD::TRUNCATE) && | |||
| 54147 | N0.hasOneUse()) { | |||
| 54148 | IntermediateBitwidthOp = N0; | |||
| 54149 | N0 = N0.getOperand(0); | |||
| 54150 | } | |||
| 54151 | ||||
| 54152 | // See if we have a single use cmov. | |||
| 54153 | if (N0.getOpcode() != X86ISD::CMOV || !N0.hasOneUse()) | |||
| 54154 | return SDValue(); | |||
| 54155 | ||||
| 54156 | SDValue CMovOp0 = N0.getOperand(0); | |||
| 54157 | SDValue CMovOp1 = N0.getOperand(1); | |||
| 54158 | ||||
| 54159 | // Make sure both operands are constants. | |||
| 54160 | if (!isa<ConstantSDNode>(CMovOp0.getNode()) || | |||
| 54161 | !isa<ConstantSDNode>(CMovOp1.getNode())) | |||
| 54162 | return SDValue(); | |||
| 54163 | ||||
| 54164 | SDLoc DL(N); | |||
| 54165 | ||||
| 54166 | // If we looked through an any_extend/trunc above, apply the same op to the constants. | |||
| 54167 | if (IntermediateBitwidthOp) { | |||
| 54168 | unsigned IntermediateOpc = IntermediateBitwidthOp.getOpcode(); | |||
| 54169 | CMovOp0 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp0); | |||
| 54170 | CMovOp1 = DAG.getNode(IntermediateOpc, DL, DstVT, CMovOp1); | |||
| 54171 | } | |||
| 54172 | ||||
| 54173 | CMovOp0 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp0, N1); | |||
| 54174 | CMovOp1 = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, DstVT, CMovOp1, N1); | |||
| 54175 | ||||
| 54176 | EVT CMovVT = DstVT; | |||
| 54177 | // We do not want i16 CMOVs. Promote to i32 and truncate afterwards. | |||
| 54178 | if (DstVT == MVT::i16) { | |||
| 54179 | CMovVT = MVT::i32; | |||
| 54180 | CMovOp0 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp0); | |||
| 54181 | CMovOp1 = DAG.getNode(ISD::ZERO_EXTEND, DL, CMovVT, CMovOp1); | |||
| 54182 | } | |||
| 54183 | ||||
| 54184 | SDValue CMov = DAG.getNode(X86ISD::CMOV, DL, CMovVT, CMovOp0, CMovOp1, | |||
| 54185 | N0.getOperand(2), N0.getOperand(3)); | |||
| 54186 | ||||
| 54187 | if (CMovVT != DstVT) | |||
| 54188 | CMov = DAG.getNode(ISD::TRUNCATE, DL, DstVT, CMov); | |||
| 54189 | ||||
| 54190 | return CMov; | |||
| 54191 | } | |||
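| | ||||
| | // Example (sketch, not part of the original source): | |||
| | //   (sext_in_reg (X86ISD::CMOV 0, 255, cond, eflags), i8) | |||
| | //     --> (X86ISD::CMOV 0, -1, cond, eflags) | |||
| | // because sign-extending the constant operands folds away at build time. | |||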
| 54192 | ||||
| 54193 | static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG, | |||
| 54194 | const X86Subtarget &Subtarget) { | |||
| 54195 | assert(N->getOpcode() == ISD::SIGN_EXTEND_INREG); | |||
| 54196 | ||||
| 54197 | if (SDValue V = combineSextInRegCmov(N, DAG)) | |||
| 54198 | return V; | |||
| 54199 | ||||
| 54200 | EVT VT = N->getValueType(0); | |||
| 54201 | SDValue N0 = N->getOperand(0); | |||
| 54202 | SDValue N1 = N->getOperand(1); | |||
| 54203 | EVT ExtraVT = cast<VTSDNode>(N1)->getVT(); | |||
| 54204 | SDLoc dl(N); | |||
| 54205 | ||||
| 54206 | // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and | |||
| 54207 | // AVX2 since there is no sign-extended shift right operation on a vector | |||
| 54208 | // with 64-bit elements. | |||
| 54209 | // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) -> | |||
| 54210 | //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT))) | |||
| 54211 | if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND || | |||
| 54212 | N0.getOpcode() == ISD::SIGN_EXTEND)) { | |||
| 54213 | SDValue N00 = N0.getOperand(0); | |||
| 54214 | ||||
| 54215 | // EXTLOAD has a better solution on AVX2: | |||
| 54216 | // it may be replaced with an X86ISD::VSEXT node. | |||
| 54217 | if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256()) | |||
| 54218 | if (!ISD::isNormalLoad(N00.getNode())) | |||
| 54219 | return SDValue(); | |||
| 54220 | ||||
| 54221 | // Attempt to promote any comparison mask ops before the | |||
| 54222 | // SIGN_EXTEND_INREG gets in the way. | |||
| 54223 | if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget)) | |||
| 54224 | return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1); | |||
| 54225 | ||||
| 54226 | if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) { | |||
| 54227 | SDValue Tmp = | |||
| 54228 | DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1); | |||
| 54229 | return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp); | |||
| 54230 | } | |||
| 54231 | } | |||
| 54232 | return SDValue(); | |||
| 54233 | } | |||
| 54234 | ||||
| 54235 | /// sext(add_nsw(x, C)) --> add(sext(x), C_sext) | |||
| 54236 | /// zext(add_nuw(x, C)) --> add(zext(x), C_zext) | |||
| 54237 | /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes | |||
| 54238 | /// opportunities to combine math ops, use an LEA, or use a complex addressing | |||
| 54239 | /// mode. This can eliminate extend, add, and shift instructions. | |||
| 54240 | static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG, | |||
| 54241 | const X86Subtarget &Subtarget) { | |||
| 54242 | if (Ext->getOpcode() != ISD::SIGN_EXTEND && | |||
| 54243 | Ext->getOpcode() != ISD::ZERO_EXTEND) | |||
| 54244 | return SDValue(); | |||
| 54245 | ||||
| 54246 | // TODO: This should be valid for other integer types. | |||
| 54247 | EVT VT = Ext->getValueType(0); | |||
| 54248 | if (VT != MVT::i64) | |||
| 54249 | return SDValue(); | |||
| 54250 | ||||
| 54251 | SDValue Add = Ext->getOperand(0); | |||
| 54252 | if (Add.getOpcode() != ISD::ADD) | |||
| 54253 | return SDValue(); | |||
| 54254 | ||||
| 54255 | bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND; | |||
| 54256 | bool NSW = Add->getFlags().hasNoSignedWrap(); | |||
| 54257 | bool NUW = Add->getFlags().hasNoUnsignedWrap(); | |||
| 54258 | ||||
| 54259 | // We need an 'add nsw' feeding into the 'sext', or an 'add nuw' feeding | |||
| 54260 | // into the 'zext'. | |||
| 54261 | if ((Sext && !NSW) || (!Sext && !NUW)) | |||
| 54262 | return SDValue(); | |||
| 54263 | ||||
| 54264 | // Having a constant operand to the 'add' ensures that we are not increasing | |||
| 54265 | // the instruction count because the constant is extended for free below. | |||
| 54266 | // A constant operand can also become the displacement field of an LEA. | |||
| 54267 | auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1)); | |||
| 54268 | if (!AddOp1) | |||
| 54269 | return SDValue(); | |||
| 54270 | ||||
| 54271 | // Don't make the 'add' bigger if there's no hope of combining it with some | |||
| 54272 | // other 'add' or 'shl' instruction. | |||
| 54273 | // TODO: It may be profitable to generate simpler LEA instructions in place | |||
| 54274 | // of single 'add' instructions, but the cost model for selecting an LEA | |||
| 54275 | // currently has a high threshold. | |||
| 54276 | bool HasLEAPotential = false; | |||
| 54277 | for (auto *User : Ext->uses()) { | |||
| 54278 | if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) { | |||
| 54279 | HasLEAPotential = true; | |||
| 54280 | break; | |||
| 54281 | } | |||
| 54282 | } | |||
| 54283 | if (!HasLEAPotential) | |||
| 54284 | return SDValue(); | |||
| 54285 | ||||
| 54286 | // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'. | |||
| 54287 | int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue(); | |||
| 54288 | SDValue AddOp0 = Add.getOperand(0); | |||
| 54289 | SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0); | |||
| 54290 | SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT); | |||
| 54291 | ||||
| 54292 | // The wider add is guaranteed to not wrap because both operands are | |||
| 54293 | // sign- or zero-extended accordingly. | |||
| 54294 | SDNodeFlags Flags; | |||
| 54295 | Flags.setNoSignedWrap(NSW); | |||
| 54296 | Flags.setNoUnsignedWrap(NUW); | |||
| 54297 | return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags); | |||
| 54298 | } | |||
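| | ||||
| | // Example (sketch, not part of the original source): with 'nsw' present, | |||
| | //   (i64 sext (add nsw i32 x, 5)) --> (add nsw (i64 sext x), 5) | |||
| | // so a user that adds or shifts the result can fold the +5 into an LEA | |||
| | // displacement. | |||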
| 54299 | ||||
| 54300 | // If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant | |||
| 54301 | // operands and the result of CMOV is not used anywhere else - promote CMOV | |||
| 54302 | // itself instead of promoting its result. This could be beneficial, because: | |||
| 54303 | // 1) X86TargetLowering::EmitLoweredSelect later can do merging of two | |||
| 54304 | // (or more) pseudo-CMOVs only when they go one-after-another and | |||
| 54305 | // getting rid of result extension code after CMOV will help that. | |||
| 54306 | // 2) Promotion of constant CMOV arguments is free, hence the | |||
| 54307 | // {ANY,SIGN,ZERO}_EXTEND will just be deleted. | |||
| 54308 | // 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this | |||
| 54309 | // promotion is also good in terms of code-size. | |||
| 54310 | // (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit | |||
| 54311 | // promotion). | |||
| 54312 | static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) { | |||
| 54313 | SDValue CMovN = Extend->getOperand(0); | |||
| 54314 | if (CMovN.getOpcode() != X86ISD::CMOV || !CMovN.hasOneUse()) | |||
| 54315 | return SDValue(); | |||
| 54316 | ||||
| 54317 | EVT TargetVT = Extend->getValueType(0); | |||
| 54318 | unsigned ExtendOpcode = Extend->getOpcode(); | |||
| 54319 | SDLoc DL(Extend); | |||
| 54320 | ||||
| 54321 | EVT VT = CMovN.getValueType(); | |||
| 54322 | SDValue CMovOp0 = CMovN.getOperand(0); | |||
| 54323 | SDValue CMovOp1 = CMovN.getOperand(1); | |||
| 54324 | ||||
| 54325 | if (!isa<ConstantSDNode>(CMovOp0.getNode()) || | |||
| 54326 | !isa<ConstantSDNode>(CMovOp1.getNode())) | |||
| 54327 | return SDValue(); | |||
| 54328 | ||||
| 54329 | // Only extend to i32 or i64. | |||
| 54330 | if (TargetVT != MVT::i32 && TargetVT != MVT::i64) | |||
| 54331 | return SDValue(); | |||
| 54332 | ||||
| 54333 | // Only extend from i16 unless it's a sign_extend from i32. Zext/aext from i32 | |||
| 54334 | // are free. | |||
| 54335 | if (VT != MVT::i16 && !(ExtendOpcode == ISD::SIGN_EXTEND && VT == MVT::i32)) | |||
| 54336 | return SDValue(); | |||
| 54337 | ||||
| 54338 | // If this is a zero extend to i64, we should only extend to i32 and use a free | |||
| 54339 | // zero extend to finish. | |||
| 54340 | EVT ExtendVT = TargetVT; | |||
| 54341 | if (TargetVT == MVT::i64 && ExtendOpcode != ISD::SIGN_EXTEND) | |||
| 54342 | ExtendVT = MVT::i32; | |||
| 54343 | ||||
| 54344 | CMovOp0 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp0); | |||
| 54345 | CMovOp1 = DAG.getNode(ExtendOpcode, DL, ExtendVT, CMovOp1); | |||
| 54346 | ||||
| 54347 | SDValue Res = DAG.getNode(X86ISD::CMOV, DL, ExtendVT, CMovOp0, CMovOp1, | |||
| 54348 | CMovN.getOperand(2), CMovN.getOperand(3)); | |||
| 54349 | ||||
| 54350 | // Finish extending if needed. | |||
| 54351 | if (ExtendVT != TargetVT) | |||
| 54352 | Res = DAG.getNode(ExtendOpcode, DL, TargetVT, Res); | |||
| 54353 | ||||
| 54354 | return Res; | |||
| 54355 | } | |||
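| | ||||
| | // Example (sketch, not part of the original source): | |||
| | //   (i32 zext (i16 X86ISD::CMOV c1, c2, cond, eflags)) | |||
| | //     --> (i32 X86ISD::CMOV (zext c1), (zext c2), cond, eflags) | |||
| | // the constant extends fold away, and the 32-bit CMOV also encodes smaller. | |||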
| 54356 | ||||
| 54357 | // Attempt to combine a (sext/zext (setcc)) to a setcc with an xmm/ymm/zmm | |||
| 54358 | // result type. | |||
| 54359 | static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG, | |||
| 54360 | const X86Subtarget &Subtarget) { | |||
| 54361 | SDValue N0 = N->getOperand(0); | |||
| 54362 | EVT VT = N->getValueType(0); | |||
| 54363 | SDLoc dl(N); | |||
| 54364 | ||||
| 54365 | // Only do this combine with AVX512 for vector extends. | |||
| 54366 | if (!Subtarget.hasAVX512() || !VT.isVector() || N0.getOpcode() != ISD::SETCC) | |||
| 54367 | return SDValue(); | |||
| 54368 | ||||
| 54369 | // Only combine legal element types. | |||
| 54370 | EVT SVT = VT.getVectorElementType(); | |||
| 54371 | if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32 && | |||
| 54372 | SVT != MVT::i64 && SVT != MVT::f32 && SVT != MVT::f64) | |||
| 54373 | return SDValue(); | |||
| 54374 | ||||
| 54375 | // We don't have a CMPP instruction for vXf16. | |||
| 54376 | if (N0.getOperand(0).getValueType().getVectorElementType() == MVT::f16) | |||
| 54377 | return SDValue(); | |||
| 54378 | // We can only do this if the vector size is 256 bits or less. | |||
| 54379 | unsigned Size = VT.getSizeInBits(); | |||
| 54380 | if (Size > 256 && Subtarget.useAVX512Regs()) | |||
| 54381 | return SDValue(); | |||
| 54382 | ||||
| 54383 | // Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since | |||
| 54384 | // those are the only integer compares we have. | |||
| 54385 | ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get(); | |||
| 54386 | if (ISD::isUnsignedIntSetCC(CC)) | |||
| 54387 | return SDValue(); | |||
| 54388 | ||||
| 54389 | // Only do this combine if the extension will be fully consumed by the setcc. | |||
| 54390 | EVT N00VT = N0.getOperand(0).getValueType(); | |||
| 54391 | EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger(); | |||
| 54392 | if (Size != MatchingVecType.getSizeInBits()) | |||
| 54393 | return SDValue(); | |||
| 54394 | ||||
| 54395 | SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC); | |||
| 54396 | ||||
| 54397 | if (N->getOpcode() == ISD::ZERO_EXTEND) | |||
| 54398 | Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType()); | |||
| 54399 | ||||
| 54400 | return Res; | |||
| 54401 | } | |||
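| | // Hedged example of combineExtSetcc (assumed AVX512VL, v8i32 operands): | |||
| | //   (sext (v8i1 (setcc sgt, v8i32 %a, v8i32 %b)) to v8i32) | |||
| | // is re-emitted as a single (v8i32 (setcc sgt, %a, %b)), whose lanes are | |||
| | // already 0/-1; for ZERO_EXTEND the result is additionally masked back to | |||
| | // 0/1 via getZeroExtendInReg. | |||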
| 54402 | ||||
| 54403 | static SDValue combineSext(SDNode *N, SelectionDAG &DAG, | |||
| 54404 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 54405 | const X86Subtarget &Subtarget) { | |||
| 54406 | SDValue N0 = N->getOperand(0); | |||
| 54407 | EVT VT = N->getValueType(0); | |||
| 54408 | SDLoc DL(N); | |||
| 54409 | ||||
| 54410 | // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry)) | |||
| 54411 | if (!DCI.isBeforeLegalizeOps() && | |||
| 54412 | N0.getOpcode() == X86ISD::SETCC_CARRY) { | |||
| 54413 | SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0), | |||
| 54414 | N0->getOperand(1)); | |||
| 54415 | bool ReplaceOtherUses = !N0.hasOneUse(); | |||
| 54416 | DCI.CombineTo(N, Setcc); | |||
| 54417 | // Replace other uses with a truncate of the widened setcc_carry. | |||
| 54418 | if (ReplaceOtherUses) { | |||
| 54419 | SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), | |||
| 54420 | N0.getValueType(), Setcc); | |||
| 54421 | DCI.CombineTo(N0.getNode(), Trunc); | |||
| 54422 | } | |||
| 54423 | ||||
| 54424 | return SDValue(N, 0); | |||
| 54425 | } | |||
| 54426 | ||||
| 54427 | if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) | |||
| 54428 | return NewCMov; | |||
| 54429 | ||||
| 54430 | if (!DCI.isBeforeLegalizeOps()) | |||
| 54431 | return SDValue(); | |||
| 54432 | ||||
| 54433 | if (SDValue V = combineExtSetcc(N, DAG, Subtarget)) | |||
| 54434 | return V; | |||
| 54435 | ||||
| 54436 | if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), DL, VT, N0, | |||
| 54437 | DAG, DCI, Subtarget)) | |||
| 54438 | return V; | |||
| 54439 | ||||
| 54440 | if (VT.isVector()) { | |||
| 54441 | if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget)) | |||
| 54442 | return R; | |||
| 54443 | ||||
| 54444 | if (N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) | |||
| 54445 | return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0)); | |||
| 54446 | } | |||
| 54447 | ||||
| 54448 | if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) | |||
| 54449 | return NewAdd; | |||
| 54450 | ||||
| 54451 | return SDValue(); | |||
| 54452 | } | |||
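| | // Sketch of the SETCC_CARRY widening at the top of combineSext: for | |||
| | //   (sext i32 (i8 (x86isd::setcc_carry cc, eflags))) | |||
| | // the carry-materializing node (0 or all-ones) is simply re-emitted at i32, | |||
| | // and any remaining users of the i8 value receive a truncate of the widened | |||
| | // node via DCI.CombineTo. | |||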
| 54453 | ||||
| 54454 | static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, | |||
| 54455 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 54456 | const X86Subtarget &Subtarget) { | |||
| 54457 | SDLoc dl(N); | |||
| 54458 | EVT VT = N->getValueType(0); | |||
| 54459 | bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode(); | |||
| 54460 | ||||
| 54461 | // Let legalize expand this if it isn't a legal type yet. | |||
| 54462 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 54463 | if (!TLI.isTypeLegal(VT)) | |||
| 54464 | return SDValue(); | |||
| 54465 | ||||
| 54466 | SDValue A = N->getOperand(IsStrict ? 1 : 0); | |||
| 54467 | SDValue B = N->getOperand(IsStrict ? 2 : 1); | |||
| 54468 | SDValue C = N->getOperand(IsStrict ? 3 : 2); | |||
| 54469 | ||||
| 54470 | // If the operation allows fast-math and the target does not support FMA, | |||
| 54471 | // split this into mul+add to avoid libcall(s). | |||
| 54472 | SDNodeFlags Flags = N->getFlags(); | |||
| 54473 | if (!IsStrict && Flags.hasAllowReassociation() && | |||
| 54474 | TLI.isOperationExpand(ISD::FMA, VT)) { | |||
| 54475 | SDValue Fmul = DAG.getNode(ISD::FMUL, dl, VT, A, B, Flags); | |||
| 54476 | return DAG.getNode(ISD::FADD, dl, VT, Fmul, C, Flags); | |||
| 54477 | } | |||
| 54478 | ||||
| 54479 | EVT ScalarVT = VT.getScalarType(); | |||
| 54480 | if (((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || | |||
| 54481 | !Subtarget.hasAnyFMA()) && | |||
| 54482 | !(ScalarVT == MVT::f16 && Subtarget.hasFP16())) | |||
| 54483 | return SDValue(); | |||
| 54484 | ||||
| 54485 | auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) { | |||
| 54486 | bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); | |||
| 54487 | bool LegalOperations = !DCI.isBeforeLegalizeOps(); | |||
| 54488 | if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations, | |||
| 54489 | CodeSize)) { | |||
| 54490 | V = NegV; | |||
| 54491 | return true; | |||
| 54492 | } | |||
| 54493 | // Look through extract_vector_elts. If it comes from an FNEG, create a | |||
| 54494 | // new extract from the FNEG input. | |||
| 54495 | if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT && | |||
| 54496 | isNullConstant(V.getOperand(1))) { | |||
| 54497 | SDValue Vec = V.getOperand(0); | |||
| 54498 | if (SDValue NegV = TLI.getCheaperNegatedExpression( | |||
| 54499 | Vec, DAG, LegalOperations, CodeSize)) { | |||
| 54500 | V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(), | |||
| 54501 | NegV, V.getOperand(1)); | |||
| 54502 | return true; | |||
| 54503 | } | |||
| 54504 | } | |||
| 54505 | ||||
| 54506 | return false; | |||
| 54507 | }; | |||
| 54508 | ||||
| 54509 | // Do not convert the passthru input of scalar intrinsics. | |||
| 54510 | // FIXME: We could allow negations of the lower element only. | |||
| 54511 | bool NegA = invertIfNegative(A); | |||
| 54512 | bool NegB = invertIfNegative(B); | |||
| 54513 | bool NegC = invertIfNegative(C); | |||
| 54514 | ||||
| 54515 | if (!NegA && !NegB && !NegC) | |||
| 54516 | return SDValue(); | |||
| 54517 | ||||
| 54518 | unsigned NewOpcode = | |||
| 54519 | negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false); | |||
| 54520 | ||||
| 54521 | // Propagate fast-math-flags to new FMA node. | |||
| 54522 | SelectionDAG::FlagInserter FlagsInserter(DAG, Flags); | |||
| 54523 | if (IsStrict) { | |||
| 54524 | assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4"); | |||
| 54525 | return DAG.getNode(NewOpcode, dl, {VT, MVT::Other}, | |||
| 54526 | {N->getOperand(0), A, B, C}); | |||
| 54527 | } else { | |||
| 54528 | if (N->getNumOperands() == 4) | |||
| 54529 | return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); | |||
| 54530 | return DAG.getNode(NewOpcode, dl, VT, A, B, C); | |||
| 54531 | } | |||
| 54532 | } | |||
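| | // Minimal worked instance (assumed FMA3 target, illustrative IR): given | |||
| | //   %nc = fneg <4 x float> %c | |||
| | //   %r  = call <4 x float> @llvm.fma.v4f32(%a, %b, %nc) | |||
| | // invertIfNegative strips the FNEG from the accumulator, and | |||
| | // negateFMAOpcode turns ISD::FMA into X86ISD::FMSUB, i.e. a*b - c, so no | |||
| | // explicit negate instruction is emitted. | |||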
| 54533 | ||||
| 54534 | // Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C) | |||
| 54535 | // Combine FMSUBADD(A, B, FNEG(C)) -> FMADDSUB(A, B, C) | |||
| 54536 | static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, | |||
| 54537 | TargetLowering::DAGCombinerInfo &DCI) { | |||
| 54538 | SDLoc dl(N); | |||
| 54539 | EVT VT = N->getValueType(0); | |||
| 54540 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 54541 | bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize(); | |||
| 54542 | bool LegalOperations = !DCI.isBeforeLegalizeOps(); | |||
| 54543 | ||||
| 54544 | SDValue N2 = N->getOperand(2); | |||
| 54545 | ||||
| 54546 | SDValue NegN2 = | |||
| 54547 | TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize); | |||
| 54548 | if (!NegN2) | |||
| 54549 | return SDValue(); | |||
| 54550 | unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false); | |||
| 54551 | ||||
| 54552 | if (N->getNumOperands() == 4) | |||
| 54553 | return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), | |||
| 54554 | NegN2, N->getOperand(3)); | |||
| 54555 | return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), | |||
| 54556 | NegN2); | |||
| 54557 | } | |||
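| | // E.g. (sketch): FMADDSUB(A, B, FNEG(C)) alternates a*b + (-c) and | |||
| | // a*b - (-c) across lanes, which is exactly FMSUBADD(A, B, C); only the | |||
| | // accumulator operand changes, the alternating add/sub lane pattern is | |||
| | // untouched. | |||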
| 54558 | ||||
| 54559 | static SDValue combineZext(SDNode *N, SelectionDAG &DAG, | |||
| 54560 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 54561 | const X86Subtarget &Subtarget) { | |||
| 54562 | SDLoc dl(N); | |||
| 54563 | SDValue N0 = N->getOperand(0); | |||
| 54564 | EVT VT = N->getValueType(0); | |||
| 54565 | ||||
| 54566 | // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry)) | |||
| 54567 | // FIXME: Is this needed? We don't seem to have any tests for it. | |||
| 54568 | if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND && | |||
| 54569 | N0.getOpcode() == X86ISD::SETCC_CARRY) { | |||
| 54570 | SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0), | |||
| 54571 | N0->getOperand(1)); | |||
| 54572 | bool ReplaceOtherUses = !N0.hasOneUse(); | |||
| 54573 | DCI.CombineTo(N, Setcc); | |||
| 54574 | // Replace other uses with a truncate of the widened setcc_carry. | |||
| 54575 | if (ReplaceOtherUses) { | |||
| 54576 | SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), | |||
| 54577 | N0.getValueType(), Setcc); | |||
| 54578 | DCI.CombineTo(N0.getNode(), Trunc); | |||
| 54579 | } | |||
| 54580 | ||||
| 54581 | return SDValue(N, 0); | |||
| 54582 | } | |||
| 54583 | ||||
| 54584 | if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) | |||
| 54585 | return NewCMov; | |||
| 54586 | ||||
| 54587 | if (DCI.isBeforeLegalizeOps()) | |||
| 54588 | if (SDValue V = combineExtSetcc(N, DAG, Subtarget)) | |||
| 54589 | return V; | |||
| 54590 | ||||
| 54591 | if (SDValue V = combineToExtendBoolVectorInReg(N->getOpcode(), dl, VT, N0, | |||
| 54592 | DAG, DCI, Subtarget)) | |||
| 54593 | return V; | |||
| 54594 | ||||
| 54595 | if (VT.isVector()) | |||
| 54596 | if (SDValue R = PromoteMaskArithmetic(N, DAG, Subtarget)) | |||
| 54597 | return R; | |||
| 54598 | ||||
| 54599 | if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget)) | |||
| 54600 | return NewAdd; | |||
| 54601 | ||||
| 54602 | if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget)) | |||
| 54603 | return R; | |||
| 54604 | ||||
| 54605 | // TODO: Combine with any target/faux shuffle. | |||
| 54606 | if (N0.getOpcode() == X86ISD::PACKUS && N0.getValueSizeInBits() == 128 && | |||
| 54607 | VT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) { | |||
| 54608 | SDValue N00 = N0.getOperand(0); | |||
| 54609 | SDValue N01 = N0.getOperand(1); | |||
| 54610 | unsigned NumSrcEltBits = N00.getScalarValueSizeInBits(); | |||
| 54611 | APInt ZeroMask = APInt::getHighBitsSet(NumSrcEltBits, NumSrcEltBits / 2); | |||
| 54612 | if ((N00.isUndef() || DAG.MaskedValueIsZero(N00, ZeroMask)) && | |||
| 54613 | (N01.isUndef() || DAG.MaskedValueIsZero(N01, ZeroMask))) { | |||
| 54614 | return concatSubVectors(N00, N01, DAG, dl); | |||
| 54615 | } | |||
| 54616 | } | |||
| 54617 | ||||
| 54618 | return SDValue(); | |||
| 54619 | } | |||
| 54620 | ||||
| 54621 | /// If we have AVX512 but not BWI, and this is a vXi16/vXi8 setcc, just | |||
| 54622 | /// pre-promote its result type since vXi1 vectors don't get promoted | |||
| 54623 | /// during type legalization. | |||
| 54624 | static SDValue truncateAVX512SetCCNoBWI(EVT VT, EVT OpVT, SDValue LHS, | |||
| 54625 | SDValue RHS, ISD::CondCode CC, | |||
| 54626 | const SDLoc &DL, SelectionDAG &DAG, | |||
| 54627 | const X86Subtarget &Subtarget) { | |||
| 54628 | if (Subtarget.hasAVX512() && !Subtarget.hasBWI() && VT.isVector() && | |||
| 54629 | VT.getVectorElementType() == MVT::i1 && | |||
| 54630 | (OpVT.getVectorElementType() == MVT::i8 || | |||
| 54631 | OpVT.getVectorElementType() == MVT::i16)) { | |||
| 54632 | SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC); | |||
| 54633 | return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc); | |||
| 54634 | } | |||
| 54635 | return SDValue(); | |||
| 54636 | } | |||
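| | // Sketch (assumption: AVX512F without BWI): a v16i8 compare whose users | |||
| | // want a v16i1 mask is built as | |||
| | //   (v16i1 (trunc (v16i8 (setcc eq, %a, %b)))) | |||
| | // so the compare legalizes as a normal 128-bit vector op (PCMPEQB) instead | |||
| | // of pushing the illegal v16i1 result through generic promotion. | |||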
| 54637 | ||||
| 54638 | static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, | |||
| 54639 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 54640 | const X86Subtarget &Subtarget) { | |||
| 54641 | const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); | |||
| 54642 | const SDValue LHS = N->getOperand(0); | |||
| 54643 | const SDValue RHS = N->getOperand(1); | |||
| 54644 | EVT VT = N->getValueType(0); | |||
| 54645 | EVT OpVT = LHS.getValueType(); | |||
| 54646 | SDLoc DL(N); | |||
| 54647 | ||||
| 54648 | if (CC == ISD::SETNE || CC == ISD::SETEQ) { | |||
| 54649 | if (SDValue V = combineVectorSizedSetCCEquality(VT, LHS, RHS, CC, DL, DAG, | |||
| 54650 | Subtarget)) | |||
| 54651 | return V; | |||
| 54652 | ||||
| 54653 | if (VT == MVT::i1) { | |||
| 54654 | X86::CondCode X86CC; | |||
| 54655 | if (SDValue V = | |||
| 54656 | MatchVectorAllEqualTest(LHS, RHS, CC, DL, Subtarget, DAG, X86CC)) | |||
| 54657 | return DAG.getNode(ISD::TRUNCATE, DL, VT, getSETCC(X86CC, V, DL, DAG)); | |||
| 54658 | } | |||
| 54659 | ||||
| 54660 | if (OpVT.isScalarInteger()) { | |||
| 54661 | // cmpeq(or(X,Y),X) --> cmpeq(and(~X,Y),0) | |||
| 54662 | // cmpne(or(X,Y),X) --> cmpne(and(~X,Y),0) | |||
| 54663 | auto MatchOrCmpEq = [&](SDValue N0, SDValue N1) { | |||
| 54664 | if (N0.getOpcode() == ISD::OR && N0->hasOneUse()) { | |||
| 54665 | if (N0.getOperand(0) == N1) | |||
| 54666 | return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT), | |||
| 54667 | N0.getOperand(1)); | |||
| 54668 | if (N0.getOperand(1) == N1) | |||
| 54669 | return DAG.getNode(ISD::AND, DL, OpVT, DAG.getNOT(DL, N1, OpVT), | |||
| 54670 | N0.getOperand(0)); | |||
| 54671 | } | |||
| 54672 | return SDValue(); | |||
| 54673 | }; | |||
| 54674 | if (SDValue AndN = MatchOrCmpEq(LHS, RHS)) | |||
| 54675 | return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC); | |||
| 54676 | if (SDValue AndN = MatchOrCmpEq(RHS, LHS)) | |||
| 54677 | return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC); | |||
| 54678 | ||||
| 54679 | // cmpeq(and(X,Y),Y) --> cmpeq(and(~X,Y),0) | |||
| 54680 | // cmpne(and(X,Y),Y) --> cmpne(and(~X,Y),0) | |||
| 54681 | auto MatchAndCmpEq = [&](SDValue N0, SDValue N1) { | |||
| 54682 | if (N0.getOpcode() == ISD::AND && N0->hasOneUse()) { | |||
| 54683 | if (N0.getOperand(0) == N1) | |||
| 54684 | return DAG.getNode(ISD::AND, DL, OpVT, N1, | |||
| 54685 | DAG.getNOT(DL, N0.getOperand(1), OpVT)); | |||
| 54686 | if (N0.getOperand(1) == N1) | |||
| 54687 | return DAG.getNode(ISD::AND, DL, OpVT, N1, | |||
| 54688 | DAG.getNOT(DL, N0.getOperand(0), OpVT)); | |||
| 54689 | } | |||
| 54690 | return SDValue(); | |||
| 54691 | }; | |||
| 54692 | if (SDValue AndN = MatchAndCmpEq(LHS, RHS)) | |||
| 54693 | return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC); | |||
| 54694 | if (SDValue AndN = MatchAndCmpEq(RHS, LHS)) | |||
| 54695 | return DAG.getSetCC(DL, VT, AndN, DAG.getConstant(0, DL, OpVT), CC); | |||
| 54696 | ||||
| 54697 | // cmpeq(trunc(x),C) --> cmpeq(x,C) | |||
| 54698 | // cmpne(trunc(x),C) --> cmpne(x,C) | |||
| 54699 | // iff x upper bits are zero. | |||
| 54700 | if (LHS.getOpcode() == ISD::TRUNCATE && | |||
| 54701 | LHS.getOperand(0).getScalarValueSizeInBits() >= 32 && | |||
| 54702 | isa<ConstantSDNode>(RHS) && !DCI.isBeforeLegalize()) { | |||
| 54703 | EVT SrcVT = LHS.getOperand(0).getValueType(); | |||
| 54704 | APInt UpperBits = APInt::getBitsSetFrom(SrcVT.getScalarSizeInBits(), | |||
| 54705 | OpVT.getScalarSizeInBits()); | |||
| 54706 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 54707 | auto *C = cast<ConstantSDNode>(RHS); | |||
| 54708 | if (DAG.MaskedValueIsZero(LHS.getOperand(0), UpperBits) && | |||
| 54709 | TLI.isTypeLegal(LHS.getOperand(0).getValueType())) | |||
| 54710 | return DAG.getSetCC(DL, VT, LHS.getOperand(0), | |||
| 54711 | DAG.getConstant(C->getAPIntValue().zextOrTrunc( | |||
| 54712 | SrcVT.getScalarSizeInBits()), | |||
| 54713 | DL, SrcVT), | |||
| 54714 | CC); | |||
| 54715 | } | |||
| 54716 | ||||
| 54717 | // With C as a power of 2 and C != 0 and C != INT_MIN: | |||
| 54718 | // icmp eq Abs(X) C -> | |||
| 54719 | // (icmp eq A, C) | (icmp eq A, -C) | |||
| 54720 | // icmp ne Abs(X) C -> | |||
| 54721 | // (icmp ne A, C) & (icmp ne A, -C) | |||
| 54722 | // Both of these patterns can be better optimized in | |||
| 54723 | // DAGCombiner::foldAndOrOfSETCC. Note this only applies for scalar | |||
| 54724 | // integers which is checked above. | |||
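| | // Worked instance (illustrative): for i32 (icmp eq (abs %x), 8), CInt = 8 | |||
| | // is a power of two and not INT_MIN, so this emits | |||
| | //   (icmp eq %x, 8) | (icmp eq %x, -8) | |||
| | // which DAGCombiner::foldAndOrOfSETCC can optimize further. | |||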
| 54725 | if (LHS.getOpcode() == ISD::ABS && LHS.hasOneUse()) { | |||
| 54726 | if (auto *C = dyn_cast<ConstantSDNode>(RHS)) { | |||
| 54727 | const APInt &CInt = C->getAPIntValue(); | |||
| 54728 | // We can better optimize this case in DAGCombiner::foldAndOrOfSETCC. | |||
| 54729 | if (CInt.isPowerOf2() && !CInt.isMinSignedValue()) { | |||
| 54730 | SDValue BaseOp = LHS.getOperand(0); | |||
| 54731 | SDValue SETCC0 = DAG.getSetCC(DL, VT, BaseOp, RHS, CC); | |||
| 54732 | SDValue SETCC1 = DAG.getSetCC( | |||
| 54733 | DL, VT, BaseOp, DAG.getConstant(-CInt, DL, OpVT), CC); | |||
| 54734 | return DAG.getNode(CC == ISD::SETEQ ? ISD::OR : ISD::AND, DL, VT, | |||
| 54735 | SETCC0, SETCC1); | |||
| 54736 | } | |||
| 54737 | } | |||
| 54738 | } | |||
| 54739 | } | |||
| 54740 | } | |||
| 54741 | ||||
| 54742 | if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && | |||
| 54743 | (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { | |||
| 54744 | // Using temporaries to avoid messing up operand ordering for later | |||
| 54745 | // transformations if this doesn't work. | |||
| 54746 | SDValue Op0 = LHS; | |||
| 54747 | SDValue Op1 = RHS; | |||
| 54748 | ISD::CondCode TmpCC = CC; | |||
| 54749 | // Put build_vector on the right. | |||
| 54750 | if (Op0.getOpcode() == ISD::BUILD_VECTOR) { | |||
| 54751 | std::swap(Op0, Op1); | |||
| 54752 | TmpCC = ISD::getSetCCSwappedOperands(TmpCC); | |||
| 54753 | } | |||
| 54754 | ||||
| 54755 | bool IsSEXT0 = | |||
| 54756 | (Op0.getOpcode() == ISD::SIGN_EXTEND) && | |||
| 54757 | (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1); | |||
| 54758 | bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode()); | |||
| 54759 | ||||
| 54760 | if (IsSEXT0 && IsVZero1) { | |||
| 54761 | assert(VT == Op0.getOperand(0).getValueType() && | |||
| 54762 | "Unexpected operand type"); | |||
| 54763 | if (TmpCC == ISD::SETGT) | |||
| 54764 | return DAG.getConstant(0, DL, VT); | |||
| 54765 | if (TmpCC == ISD::SETLE) | |||
| 54766 | return DAG.getConstant(1, DL, VT); | |||
| 54767 | if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE) | |||
| 54768 | return DAG.getNOT(DL, Op0.getOperand(0), VT); | |||
| 54769 | ||||
| 54770 | assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) && | |||
| 54771 | "Unexpected condition code!"); | |||
| 54772 | return Op0.getOperand(0); | |||
| 54773 | } | |||
| 54774 | } | |||
| 54775 | ||||
| 54776 | // Try to make an unsigned vector comparison signed. On pre-AVX512 targets | |||
| 54777 | // only signed comparisons (`PCMPGT`) are available, and on AVX512 it's often | |||
| 54778 | // better to use `PCMPGT` if the result is meant to stay in a vector (if it's | |||
| 54779 | // going to a mask, AVX512 has both signed and unsigned comparisons). | |||
| 54780 | if (VT.isVector() && OpVT.isVector() && OpVT.isInteger()) { | |||
| 54781 | bool CanMakeSigned = false; | |||
| 54782 | if (ISD::isUnsignedIntSetCC(CC)) { | |||
| 54783 | KnownBits CmpKnown = KnownBits::commonBits(DAG.computeKnownBits(LHS), | |||
| 54784 | DAG.computeKnownBits(RHS)); | |||
| 54785 | // If we know LHS/RHS share the same sign bit at each element we can | |||
| 54786 | // make this signed. | |||
| 54787 | // NOTE: `computeKnownBits` on a vector type aggregates common bits | |||
| 54788 | // across all lanes. So a pattern where the sign varies from lane to | |||
| 54789 | // lane, but at each lane Sign(LHS) is known to equal Sign(RHS), will be | |||
| 54790 | // missed. We could get around this by demanding each lane | |||
| 54791 | // independently, but this isn't the most important optimization and | |||
| 54792 | // that may eat into compile time. | |||
| 54793 | CanMakeSigned = | |||
| 54794 | CmpKnown.Zero.isSignBitSet() || CmpKnown.One.isSignBitSet(); | |||
| 54795 | } | |||
| 54796 | if (CanMakeSigned || ISD::isSignedIntSetCC(CC)) { | |||
| 54797 | SDValue LHSOut = LHS; | |||
| 54798 | SDValue RHSOut = RHS; | |||
| 54799 | ISD::CondCode NewCC = CC; | |||
| 54800 | switch (CC) { | |||
| 54801 | case ISD::SETGE: | |||
| 54802 | case ISD::SETUGE: | |||
| 54803 | if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ true, | |||
| 54804 | /*NSW*/ true)) | |||
| 54805 | LHSOut = NewLHS; | |||
| 54806 | else if (SDValue NewRHS = incDecVectorConstant( | |||
| 54807 | RHS, DAG, /*IsInc*/ false, /*NSW*/ true)) | |||
| 54808 | RHSOut = NewRHS; | |||
| 54809 | else | |||
| 54810 | break; | |||
| 54811 | ||||
| 54812 | [[fallthrough]]; | |||
| 54813 | case ISD::SETUGT: | |||
| 54814 | NewCC = ISD::SETGT; | |||
| 54815 | break; | |||
| 54816 | ||||
| 54817 | case ISD::SETLE: | |||
| 54818 | case ISD::SETULE: | |||
| 54819 | if (SDValue NewLHS = incDecVectorConstant(LHS, DAG, /*IsInc*/ false, | |||
| 54820 | /*NSW*/ true)) | |||
| 54821 | LHSOut = NewLHS; | |||
| 54822 | else if (SDValue NewRHS = incDecVectorConstant(RHS, DAG, /*IsInc*/ true, | |||
| 54823 | /*NSW*/ true)) | |||
| 54824 | RHSOut = NewRHS; | |||
| 54825 | else | |||
| 54826 | break; | |||
| 54827 | ||||
| 54828 | [[fallthrough]]; | |||
| 54829 | case ISD::SETULT: | |||
| 54830 | // Will be swapped to SETGT in LowerVSETCC*. | |||
| 54831 | NewCC = ISD::SETLT; | |||
| 54832 | break; | |||
| 54833 | default: | |||
| 54834 | break; | |||
| 54835 | } | |||
| 54836 | if (NewCC != CC) { | |||
| 54837 | if (SDValue R = truncateAVX512SetCCNoBWI(VT, OpVT, LHSOut, RHSOut, | |||
| 54838 | NewCC, DL, DAG, Subtarget)) | |||
| 54839 | return R; | |||
| 54840 | return DAG.getSetCC(DL, VT, LHSOut, RHSOut, NewCC); | |||
| 54841 | } | |||
| 54842 | } | |||
| 54843 | } | |||
| 54844 | ||||
| 54845 | if (SDValue R = | |||
| 54846 | truncateAVX512SetCCNoBWI(VT, OpVT, LHS, RHS, CC, DL, DAG, Subtarget)) | |||
| 54847 | return R; | |||
| 54848 | ||||
| 54849 | // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early | |||
| 54850 | // to avoid scalarization via legalization because v4i32 is not a legal type. | |||
| 54851 | if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 && | |||
| 54852 | LHS.getValueType() == MVT::v4f32) | |||
| 54853 | return LowerVSETCC(SDValue(N, 0), Subtarget, DAG); | |||
| 54854 | ||||
| 54855 | // X pred 0.0 --> X pred -X | |||
| 54856 | // If the negation of X already exists, use it in the comparison. This removes | |||
| 54857 | // the need to materialize 0.0 and allows matching to SSE's MIN/MAX | |||
| 54858 | // instructions in patterns with a 'select' node. | |||
| 54859 | if (isNullFPScalarOrVectorConst(RHS)) { | |||
| 54860 | SDVTList FNegVT = DAG.getVTList(OpVT); | |||
| 54861 | if (SDNode *FNeg = DAG.getNodeIfExists(ISD::FNEG, FNegVT, {LHS})) | |||
| 54862 | return DAG.getSetCC(DL, VT, LHS, SDValue(FNeg, 0), CC); | |||
| 54863 | } | |||
| 54864 | ||||
| 54865 | return SDValue(); | |||
| 54866 | } | |||
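| | // Illustrative use of the final fold above: in | |||
| | //   %n = fneg float %x | |||
| | //   %c = fcmp olt float %x, 0.0 | |||
| | // the compare is rewritten as (setcc olt, %x, %n), reusing the existing | |||
| | // FNEG so no 0.0 constant is materialized and the SSE MIN/MAX select | |||
| | // patterns can match. | |||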
| 54867 | ||||
| 54868 | static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, | |||
| 54869 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 54870 | const X86Subtarget &Subtarget) { | |||
| 54871 | SDValue Src = N->getOperand(0); | |||
| 54872 | MVT SrcVT = Src.getSimpleValueType(); | |||
| 54873 | MVT VT = N->getSimpleValueType(0); | |||
| 54874 | unsigned NumBits = VT.getScalarSizeInBits(); | |||
| 54875 | unsigned NumElts = SrcVT.getVectorNumElements(); | |||
| 54876 | unsigned NumBitsPerElt = SrcVT.getScalarSizeInBits(); | |||
| 54877 | assert(VT == MVT::i32 && NumElts <= NumBits && "Unexpected MOVMSK types"); | |||
| 54878 | ||||
| 54879 | // Perform constant folding. | |||
| 54880 | APInt UndefElts; | |||
| 54881 | SmallVector<APInt, 32> EltBits; | |||
| 54882 | if (getTargetConstantBitsFromNode(Src, NumBitsPerElt, UndefElts, EltBits)) { | |||
| 54883 | APInt Imm(32, 0); | |||
| 54884 | for (unsigned Idx = 0; Idx != NumElts; ++Idx) | |||
| 54885 | if (!UndefElts[Idx] && EltBits[Idx].isNegative()) | |||
| 54886 | Imm.setBit(Idx); | |||
| 54887 | ||||
| 54888 | return DAG.getConstant(Imm, SDLoc(N), VT); | |||
| 54889 | } | |||
| 54890 | ||||
| 54891 | // Look through int->fp bitcasts that don't change the element width. | |||
| 54892 | unsigned EltWidth = SrcVT.getScalarSizeInBits(); | |||
| 54893 | if (Subtarget.hasSSE2() && Src.getOpcode() == ISD::BITCAST && | |||
| 54894 | Src.getOperand(0).getScalarValueSizeInBits() == EltWidth) | |||
| 54895 | return DAG.getNode(X86ISD::MOVMSK, SDLoc(N), VT, Src.getOperand(0)); | |||
| 54896 | ||||
| 54897 | // Fold movmsk(not(x)) -> not(movmsk(x)) to improve folding of movmsk results | |||
| 54898 | // with scalar comparisons. | |||
| 54899 | if (SDValue NotSrc = IsNOT(Src, DAG)) { | |||
| 54900 | SDLoc DL(N); | |||
| 54901 | APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts); | |||
| 54902 | NotSrc = DAG.getBitcast(SrcVT, NotSrc); | |||
| 54903 | return DAG.getNode(ISD::XOR, DL, VT, | |||
| 54904 | DAG.getNode(X86ISD::MOVMSK, DL, VT, NotSrc), | |||
| 54905 | DAG.getConstant(NotMask, DL, VT)); | |||
| 54906 | } | |||
| 54907 | ||||
| 54908 | // Fold movmsk(icmp_sgt(x,-1)) -> not(movmsk(x)) to improve folding of movmsk | |||
| 54909 | // results with scalar comparisons. | |||
| 54910 | if (Src.getOpcode() == X86ISD::PCMPGT && | |||
| 54911 | ISD::isBuildVectorAllOnes(Src.getOperand(1).getNode())) { | |||
| 54912 | SDLoc DL(N); | |||
| 54913 | APInt NotMask = APInt::getLowBitsSet(NumBits, NumElts); | |||
| 54914 | return DAG.getNode(ISD::XOR, DL, VT, | |||
| 54915 | DAG.getNode(X86ISD::MOVMSK, DL, VT, Src.getOperand(0)), | |||
| 54916 | DAG.getConstant(NotMask, DL, VT)); | |||
| 54917 | } | |||
| 54918 | ||||
| 54919 | // Fold movmsk(icmp_eq(and(x,c1),c1)) -> movmsk(shl(x,c2)) | |||
| 54920 | // Fold movmsk(icmp_eq(and(x,c1),0)) -> movmsk(not(shl(x,c2))) | |||
| 54921 | // iff pow2splat(c1). | |||
| 54922 | // Use KnownBits to determine if only a single bit is non-zero | |||
| 54923 | // in each element (pow2 or zero), and shift that bit to the msb. | |||
| 54924 | if (Src.getOpcode() == X86ISD::PCMPEQ) { | |||
| 54925 | KnownBits KnownLHS = DAG.computeKnownBits(Src.getOperand(0)); | |||
| 54926 | KnownBits KnownRHS = DAG.computeKnownBits(Src.getOperand(1)); | |||
| 54927 | unsigned ShiftAmt = KnownLHS.countMinLeadingZeros(); | |||
| 54928 | if (KnownLHS.countMaxPopulation() == 1 && | |||
| 54929 | (KnownRHS.isZero() || (KnownRHS.countMaxPopulation() == 1 && | |||
| 54930 | ShiftAmt == KnownRHS.countMinLeadingZeros()))) { | |||
| 54931 | SDLoc DL(N); | |||
| 54932 | MVT ShiftVT = SrcVT; | |||
| 54933 | SDValue ShiftLHS = Src.getOperand(0); | |||
| 54934 | SDValue ShiftRHS = Src.getOperand(1); | |||
| 54935 | if (ShiftVT.getScalarType() == MVT::i8) { | |||
| 54936 | // vXi8 shifts - we only care about the signbit so can use PSLLW. | |||
| 54937 | ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2); | |||
| 54938 | ShiftLHS = DAG.getBitcast(ShiftVT, ShiftLHS); | |||
| 54939 | ShiftRHS = DAG.getBitcast(ShiftVT, ShiftRHS); | |||
| 54940 | } | |||
| 54941 | ShiftLHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT, | |||
| 54942 | ShiftLHS, ShiftAmt, DAG); | |||
| 54943 | ShiftRHS = getTargetVShiftByConstNode(X86ISD::VSHLI, DL, ShiftVT, | |||
| 54944 | ShiftRHS, ShiftAmt, DAG); | |||
| 54945 | ShiftLHS = DAG.getBitcast(SrcVT, ShiftLHS); | |||
| 54946 | ShiftRHS = DAG.getBitcast(SrcVT, ShiftRHS); | |||
| 54947 | SDValue Res = DAG.getNode(ISD::XOR, DL, SrcVT, ShiftLHS, ShiftRHS); | |||
| 54948 | return DAG.getNode(X86ISD::MOVMSK, DL, VT, DAG.getNOT(DL, Res, SrcVT)); | |||
| 54949 | } | |||
| 54950 | } | |||
| 54951 | ||||
| 54952 | // Fold movmsk(logic(X,C)) -> logic(movmsk(X),C) | |||
| 54953 | if (N->isOnlyUserOf(Src.getNode())) { | |||
| 54954 | SDValue SrcBC = peekThroughOneUseBitcasts(Src); | |||
| 54955 | if (ISD::isBitwiseLogicOp(SrcBC.getOpcode())) { | |||
| 54956 | APInt UndefElts; | |||
| 54957 | SmallVector<APInt, 32> EltBits; | |||
| 54958 | if (getTargetConstantBitsFromNode(SrcBC.getOperand(1), NumBitsPerElt, | |||
| 54959 | UndefElts, EltBits)) { | |||
| 54960 | APInt Mask = APInt::getZero(NumBits); | |||
| 54961 | for (unsigned Idx = 0; Idx != NumElts; ++Idx) { | |||
| 54962 | if (!UndefElts[Idx] && EltBits[Idx].isNegative()) | |||
| 54963 | Mask.setBit(Idx); | |||
| 54964 | } | |||
| 54965 | SDLoc DL(N); | |||
| 54966 | SDValue NewSrc = DAG.getBitcast(SrcVT, SrcBC.getOperand(0)); | |||
| 54967 | SDValue NewMovMsk = DAG.getNode(X86ISD::MOVMSK, DL, VT, NewSrc); | |||
| 54968 | return DAG.getNode(SrcBC.getOpcode(), DL, VT, NewMovMsk, | |||
| 54969 | DAG.getConstant(Mask, DL, VT)); | |||
| 54970 | } | |||
| 54971 | } | |||
| 54972 | } | |||
| 54973 | ||||
| 54974 | // Simplify the inputs. | |||
| 54975 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 54976 | APInt DemandedMask(APInt::getAllOnes(NumBits)); | |||
| 54977 | if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI)) | |||
| 54978 | return SDValue(N, 0); | |||
| 54979 | ||||
| 54980 | return SDValue(); | |||
| 54981 | } | |||
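| | // Sketch of the NOT fold in combineMOVMSK (assumed v4f32/MOVMSKPS, so | |||
| | // NumElts = 4, NumBits = 32): movmsk(not(x)) becomes | |||
| | //   xor(movmsk(x), 0b1111) | |||
| | // where 0b1111 is APInt::getLowBitsSet(32, 4); only the four defined mask | |||
| | // bits are flipped and the zeroed upper bits stay zero. | |||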
| 54982 | ||||
| 54983 | static SDValue combineTESTP(SDNode *N, SelectionDAG &DAG, | |||
| 54984 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 54985 | const X86Subtarget &Subtarget) { | |||
| 54986 | MVT VT = N->getSimpleValueType(0); | |||
| 54987 | unsigned NumBits = VT.getScalarSizeInBits(); | |||
| 54988 | ||||
| 54989 | // Simplify the inputs. | |||
| 54990 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 54991 | APInt DemandedMask(APInt::getAllOnes(NumBits)); | |||
| 54992 | if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI)) | |||
| 54993 | return SDValue(N, 0); | |||
| 54994 | ||||
| 54995 | return SDValue(); | |||
| 54996 | } | |||
| 54997 | ||||
| 54998 | static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG, | |||
| 54999 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 55000 | const X86Subtarget &Subtarget) { | |||
| 55001 | auto *MemOp = cast<X86MaskedGatherScatterSDNode>(N); | |||
| 55002 | SDValue BasePtr = MemOp->getBasePtr(); | |||
| 55003 | SDValue Index = MemOp->getIndex(); | |||
| 55004 | SDValue Scale = MemOp->getScale(); | |||
| 55005 | SDValue Mask = MemOp->getMask(); | |||
| 55006 | ||||
| 55007 | // Attempt to fold an index scale into the scale value directly. | |||
| 55008 | // For smaller indices, implicit sext is performed BEFORE scale, preventing | |||
| 55009 | // this fold under most circumstances. | |||
| 55010 | // TODO: Move this into X86DAGToDAGISel::matchVectorAddressRecursively? | |||
| 55011 | if ((Index.getOpcode() == X86ISD::VSHLI || | |||
| 55012 | (Index.getOpcode() == ISD::ADD && | |||
| 55013 | Index.getOperand(0) == Index.getOperand(1))) && | |||
| 55014 | isa<ConstantSDNode>(Scale) && | |||
| 55015 | BasePtr.getScalarValueSizeInBits() == Index.getScalarValueSizeInBits()) { | |||
| 55016 | unsigned ShiftAmt = | |||
| 55017 | Index.getOpcode() == ISD::ADD ? 1 : Index.getConstantOperandVal(1); | |||
| 55018 | uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue(); | |||
| 55019 | uint64_t NewScaleAmt = ScaleAmt * (1ULL << ShiftAmt); | |||
| 55020 | if (isPowerOf2_64(NewScaleAmt) && NewScaleAmt <= 8) { | |||
| 55021 | SDValue NewIndex = Index.getOperand(0); | |||
| 55022 | SDValue NewScale = | |||
| 55023 | DAG.getTargetConstant(NewScaleAmt, SDLoc(N), Scale.getValueType()); | |||
| 55024 | if (N->getOpcode() == X86ISD::MGATHER) | |||
| 55025 | return getAVX2GatherNode(N->getOpcode(), SDValue(N, 0), DAG, | |||
| 55026 | MemOp->getOperand(1), Mask, | |||
| 55027 | MemOp->getBasePtr(), NewIndex, NewScale, | |||
| 55028 | MemOp->getChain(), Subtarget); | |||
| 55029 | if (N->getOpcode() == X86ISD::MSCATTER) | |||
| 55030 | return getScatterNode(N->getOpcode(), SDValue(N, 0), DAG, | |||
| 55031 | MemOp->getOperand(1), Mask, MemOp->getBasePtr(), | |||
| 55032 | NewIndex, NewScale, MemOp->getChain(), Subtarget); | |||
| 55033 | } | |||
| 55034 | } | |||
| 55035 | ||||
| 55036 | // With vector masks we only demand the upper bit of the mask. | |||
| 55037 | if (Mask.getScalarValueSizeInBits() != 1) { | |||
| 55038 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 55039 | APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); | |||
| 55040 | if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) { | |||
| 55041 | if (N->getOpcode() != ISD::DELETED_NODE) | |||
| 55042 | DCI.AddToWorklist(N); | |||
| 55043 | return SDValue(N, 0); | |||
| 55044 | } | |||
| 55045 | } | |||
| 55046 | ||||
| 55047 | return SDValue(); | |||
| 55048 | } | |||
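| | // Worked example of the scale fold (hypothetical values): a gather with | |||
| | // Scale = 2 whose index is (vshli %idx, 2) gives | |||
| | //   NewScaleAmt = 2 * (1ULL << 2) = 8 | |||
| | // a power of two <= 8, so it is rebuilt with scale-8 addressing of %idx; | |||
| | // with Scale = 4 the product would be 16 and the fold is rejected. | |||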
| 55049 | ||||
| 55050 | static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS, | |||
| 55051 | SDValue Index, SDValue Base, SDValue Scale, | |||
| 55052 | SelectionDAG &DAG) { | |||
| 55053 | SDLoc DL(GorS); | |||
| 55054 | ||||
| 55055 | if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) { | |||
| 55056 | SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(), | |||
| 55057 | Gather->getMask(), Base, Index, Scale } ; | |||
| 55058 | return DAG.getMaskedGather(Gather->getVTList(), | |||
| 55059 | Gather->getMemoryVT(), DL, Ops, | |||
| 55060 | Gather->getMemOperand(), | |||
| 55061 | Gather->getIndexType(), | |||
| 55062 | Gather->getExtensionType()); | |||
| 55063 | } | |||
| 55064 | auto *Scatter = cast<MaskedScatterSDNode>(GorS); | |||
| 55065 | SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(), | |||
| 55066 | Scatter->getMask(), Base, Index, Scale }; | |||
| 55067 | return DAG.getMaskedScatter(Scatter->getVTList(), | |||
| 55068 | Scatter->getMemoryVT(), DL, | |||
| 55069 | Ops, Scatter->getMemOperand(), | |||
| 55070 | Scatter->getIndexType(), | |||
| 55071 | Scatter->isTruncatingStore()); | |||
| 55072 | } | |||
| 55073 | ||||
| 55074 | static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, | |||
| 55075 | TargetLowering::DAGCombinerInfo &DCI) { | |||
| 55076 | SDLoc DL(N); | |||
| 55077 | auto *GorS = cast<MaskedGatherScatterSDNode>(N); | |||
| 55078 | SDValue Index = GorS->getIndex(); | |||
| 55079 | SDValue Base = GorS->getBasePtr(); | |||
| 55080 | SDValue Scale = GorS->getScale(); | |||
| 55081 | ||||
| 55082 | if (DCI.isBeforeLegalize()) { | |||
| 55083 | unsigned IndexWidth = Index.getScalarValueSizeInBits(); | |||
| 55084 | ||||
| 55085 | // Shrink constant indices if they are larger than 32-bits. | |||
| 55086 | // Only do this before legalize types since v2i64 could become v2i32. | |||
| 55087 | // FIXME: We could check that the type is legal if we're after legalize | |||
| 55088 | // types, but then we would need to construct test cases where that happens. | |||
| 55089 | // FIXME: We could support more than just constant vectors, but we need to be | |||
| 55090 | // careful with costing. A truncate that can be optimized out would be fine. | |||
| 55091 | // Otherwise we might only want to create a truncate if it avoids a split. | |||
| 55092 | if (auto *BV = dyn_cast<BuildVectorSDNode>(Index)) { | |||
| 55093 | if (BV->isConstant() && IndexWidth > 32 && | |||
| 55094 | DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) { | |||
| 55095 | EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32); | |||
| 55096 | Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index); | |||
| 55097 | return rebuildGatherScatter(GorS, Index, Base, Scale, DAG); | |||
| 55098 | } | |||
| 55099 | } | |||
| 55100 | ||||
| 55101 | // Shrink any sign/zero extends from a type 32 bits or smaller to one larger | |||
| 55102 | // than 32 bits if there are sufficient sign bits. Only do this before | |||
| 55103 | // legalize types to avoid creating illegal types in the truncate. | |||
| 55104 | if ((Index.getOpcode() == ISD::SIGN_EXTEND || | |||
| 55105 | Index.getOpcode() == ISD::ZERO_EXTEND) && | |||
| 55106 | IndexWidth > 32 && | |||
| 55107 | Index.getOperand(0).getScalarValueSizeInBits() <= 32 && | |||
| 55108 | DAG.ComputeNumSignBits(Index) > (IndexWidth - 32)) { | |||
| 55109 | EVT NewVT = Index.getValueType().changeVectorElementType(MVT::i32); | |||
| 55110 | Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index); | |||
| 55111 | return rebuildGatherScatter(GorS, Index, Base, Scale, DAG); | |||
| 55112 | } | |||
| 55113 | } | |||
| 55114 | ||||
| 55115 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 55116 | EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout()); | |||
| 55117 | // Try to move splat constant adders from the index operand to the base | |||
| 55118 | // pointer operand, taking care to multiply by the scale. We can only do | |||
| 55119 | // this when the index element type is the same as the pointer type; | |||
| 55120 | // otherwise we would need to be sure the math doesn't wrap before the scale. | |||
| 55121 | if (Index.getOpcode() == ISD::ADD && | |||
| 55122 | Index.getValueType().getVectorElementType() == PtrVT && | |||
| 55123 | isa<ConstantSDNode>(Scale)) { | |||
| 55124 | uint64_t ScaleAmt = cast<ConstantSDNode>(Scale)->getZExtValue(); | |||
| 55125 | if (auto *BV = dyn_cast<BuildVectorSDNode>(Index.getOperand(1))) { | |||
| 55126 | BitVector UndefElts; | |||
| 55127 | if (ConstantSDNode *C = BV->getConstantSplatNode(&UndefElts)) { | |||
| 55128 | // FIXME: Allow non-constant? | |||
| 55129 | if (UndefElts.none()) { | |||
| 55130 | // Apply the scale. | |||
| 55131 | APInt Adder = C->getAPIntValue() * ScaleAmt; | |||
| 55132 | // Add it to the existing base. | |||
| 55133 | Base = DAG.getNode(ISD::ADD, DL, PtrVT, Base, | |||
| 55134 | DAG.getConstant(Adder, DL, PtrVT)); | |||
| 55135 | Index = Index.getOperand(0); | |||
| 55136 | return rebuildGatherScatter(GorS, Index, Base, Scale, DAG); | |||
| 55137 | } | |||
| 55138 | } | |||
| 55139 | ||||
| 55140 | // It's also possible that the base is just a constant. In that case, just | |||
| 55141 | // replace it with 0 and move the displacement into the index. | |||
| 55142 | if (BV->isConstant() && isa<ConstantSDNode>(Base) && | |||
| 55143 | isOneConstant(Scale)) { | |||
| 55144 | SDValue Splat = DAG.getSplatBuildVector(Index.getValueType(), DL, Base); | |||
| 55145 | // Combine the constant build_vector and the constant base. | |||
| 55146 | Splat = DAG.getNode(ISD::ADD, DL, Index.getValueType(), | |||
| 55147 | Index.getOperand(1), Splat); | |||
| 55148 | // Add to the LHS of the original Index add. | |||
| 55149 | Index = DAG.getNode(ISD::ADD, DL, Index.getValueType(), | |||
| 55150 | Index.getOperand(0), Splat); | |||
| 55151 | Base = DAG.getConstant(0, DL, Base.getValueType()); | |||
| 55152 | return rebuildGatherScatter(GorS, Index, Base, Scale, DAG); | |||
| 55153 | } | |||
| 55154 | } | |||
| 55155 | } | |||
| 55156 | ||||
| 55157 | if (DCI.isBeforeLegalizeOps()) { | |||
| 55158 | unsigned IndexWidth = Index.getScalarValueSizeInBits(); | |||
| 55159 | ||||
| 55160 | // Make sure the index is either i32 or i64 | |||
| 55161 | if (IndexWidth != 32 && IndexWidth != 64) { | |||
| 55162 | MVT EltVT = IndexWidth > 32 ? MVT::i64 : MVT::i32; | |||
| 55163 | EVT IndexVT = Index.getValueType().changeVectorElementType(EltVT); | |||
| 55164 | Index = DAG.getSExtOrTrunc(Index, DL, IndexVT); | |||
| 55165 | return rebuildGatherScatter(GorS, Index, Base, Scale, DAG); | |||
| 55166 | } | |||
| 55167 | } | |||
| 55168 | ||||
| 55169 | // With vector masks we only demand the upper bit of the mask. | |||
| 55170 | SDValue Mask = GorS->getMask(); | |||
| 55171 | if (Mask.getScalarValueSizeInBits() != 1) { | |||
| 55172 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 55173 | APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); | |||
| 55174 | if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) { | |||
| 55175 | if (N->getOpcode() != ISD::DELETED_NODE) | |||
| 55176 | DCI.AddToWorklist(N); | |||
| 55177 | return SDValue(N, 0); | |||
| 55178 | } | |||
| 55179 | } | |||
| 55180 | ||||
| 55181 | return SDValue(); | |||
| 55182 | } | |||
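| | // Sketch (assumption: before type legalization, v2i64 index): an index of | |||
| | //   (sext v2i32 %i to v2i64) | |||
| | // has at least 33 sign bits, which exceeds IndexWidth - 32 = 32, so it is | |||
| | // truncated back to v2i32 and the gather/scatter rebuilt, halving the | |||
| | // width of the index register. | |||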
| 55183 | ||||
| 55184 | // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT | |||
| 55185 | static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, | |||
| 55186 | const X86Subtarget &Subtarget) { | |||
| 55187 | SDLoc DL(N); | |||
| 55188 | X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0)); | |||
| 55189 | SDValue EFLAGS = N->getOperand(1); | |||
| 55190 | ||||
| 55191 | // Try to simplify the EFLAGS and condition code operands. | |||
| 55192 | if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) | |||
| 55193 | return getSETCC(CC, Flags, DL, DAG); | |||
| 55194 | ||||
| 55195 | return SDValue(); | |||
| 55196 | } | |||
| 55197 | ||||
| 55198 | /// Optimize branch condition evaluation. | |||
| 55199 | static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, | |||
| 55200 | const X86Subtarget &Subtarget) { | |||
| 55201 | SDLoc DL(N); | |||
| 55202 | SDValue EFLAGS = N->getOperand(3); | |||
| 55203 | X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2)); | |||
| 55204 | ||||
| 55205 | // Try to simplify the EFLAGS and condition code operands. | |||
| 55206 | // Make sure to not keep references to operands, as combineSetCCEFLAGS can | |||
| 55207 | // RAUW them under us. | |||
| 55208 | if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) { | |||
| 55209 | SDValue Cond = DAG.getTargetConstant(CC, DL, MVT::i8); | |||
| 55210 | return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0), | |||
| 55211 | N->getOperand(1), Cond, Flags); | |||
| 55212 | } | |||
| 55213 | ||||
| 55214 | return SDValue(); | |||
| 55215 | } | |||
| 55216 | ||||
| 55217 | // TODO: Could we move this to DAGCombine? | |||
| 55218 | static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N, | |||
| 55219 | SelectionDAG &DAG) { | |||
| 55220 | // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane | |||
| 55221 | // to optimize away operation when it's from a constant. | |||
| 55222 | // | |||
| 55223 | // The general transformation is: | |||
| 55224 | // UNARYOP(AND(VECTOR_CMP(x,y), constant)) --> | |||
| 55225 | // AND(VECTOR_CMP(x,y), constant2) | |||
| 55226 | // constant2 = UNARYOP(constant) | |||
| 55227 | ||||
| 55228 | // Early exit if this isn't a vector operation, the operand of the | |||
| 55229 | // unary operation isn't a bitwise AND, or if the sizes of the operations | |||
| 55230 | // aren't the same. | |||
| 55231 | EVT VT = N->getValueType(0); | |||
| 55232 | bool IsStrict = N->isStrictFPOpcode(); | |||
| 55233 | unsigned NumEltBits = VT.getScalarSizeInBits(); | |||
| 55234 | SDValue Op0 = N->getOperand(IsStrict ? 1 : 0); | |||
| 55235 | if (!VT.isVector() || Op0.getOpcode() != ISD::AND || | |||
| 55236 | DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits || | |||
| 55237 | VT.getSizeInBits() != Op0.getValueSizeInBits()) | |||
| 55238 | return SDValue(); | |||
| 55239 | ||||
| 55240 | // Now check that the other operand of the AND is a constant. We could | |||
| 55241 | // make the transformation for non-constant splats as well, but it's unclear | |||
| 55242 | // that would be a benefit as it would not eliminate any operations, just | |||
| 55243 | // perform one more step in scalar code before moving to the vector unit. | |||
| 55244 | if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) { | |||
| 55245 | // Bail out if the vector isn't a constant. | |||
| 55246 | if (!BV->isConstant()) | |||
| 55247 | return SDValue(); | |||
| 55248 | ||||
| 55249 | // Everything checks out. Build up the new and improved node. | |||
| 55250 | SDLoc DL(N); | |||
| 55251 | EVT IntVT = BV->getValueType(0); | |||
| 55252 | // Create a new constant of the appropriate type for the transformed | |||
| 55253 | // DAG. | |||
| 55254 | SDValue SourceConst; | |||
| 55255 | if (IsStrict) | |||
| 55256 | SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other}, | |||
| 55257 | {N->getOperand(0), SDValue(BV, 0)}); | |||
| 55258 | else | |||
| 55259 | SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0)); | |||
| 55260 | // The AND node needs bitcasts to/from an integer vector type around it. | |||
| 55261 | SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst); | |||
| 55262 | SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0), | |||
| 55263 | MaskConst); | |||
| 55264 | SDValue Res = DAG.getBitcast(VT, NewAnd); | |||
| 55265 | if (IsStrict) | |||
| 55266 | return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL); | |||
| 55267 | return Res; | |||
| 55268 | } | |||
| 55269 | ||||
| 55270 | return SDValue(); | |||
| 55271 | } | |||
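| | // Hedged instance of the transform (values illustrative): for | |||
| | //   sitofp (and (setcc v4i32 %x, %y), <i32 1, i32 2, i32 3, i32 4>) | |||
| | // each lane is either the constant or zero, so it is rebuilt as | |||
| | //   bitcast (and (setcc %x, %y), bitcast (sitofp <1, 2, 3, 4>)) | |||
| | // folding the conversion of the constant vector at compile time. | |||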
| 55272 | ||||
| 55273 | /// If we are converting a value to floating-point, try to replace scalar | |||
| 55274 | /// truncate of an extracted vector element with a bitcast. This tries to keep | |||
| 55275 | /// the sequence on XMM registers rather than moving between vector and GPRs. | |||
| 55276 | static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) { | |||
| 55277 | // TODO: This is currently only used by combineSIntToFP, but it is generalized | |||
| 55278 | // to allow being called by any similar cast opcode. | |||
| 55279 | // TODO: Consider merging this into lowering: vectorizeExtractedCast(). | |||
| 55280 | SDValue Trunc = N->getOperand(0); | |||
| 55281 | if (!Trunc.hasOneUse() || Trunc.getOpcode() != ISD::TRUNCATE) | |||
| 55282 | return SDValue(); | |||
| 55283 | ||||
| 55284 | SDValue ExtElt = Trunc.getOperand(0); | |||
| 55285 | if (!ExtElt.hasOneUse() || ExtElt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || | |||
| 55286 | !isNullConstant(ExtElt.getOperand(1))) | |||
| 55287 | return SDValue(); | |||
| 55288 | ||||
| 55289 | EVT TruncVT = Trunc.getValueType(); | |||
| 55290 | EVT SrcVT = ExtElt.getValueType(); | |||
| 55291 | unsigned DestWidth = TruncVT.getSizeInBits(); | |||
| 55292 | unsigned SrcWidth = SrcVT.getSizeInBits(); | |||
| 55293 | if (SrcWidth % DestWidth != 0) | |||
| 55294 | return SDValue(); | |||
| 55295 | ||||
| 55296 | // inttofp (trunc (extelt X, 0)) --> inttofp (extelt (bitcast X), 0) | |||
| 55297 | EVT SrcVecVT = ExtElt.getOperand(0).getValueType(); | |||
| 55298 | unsigned VecWidth = SrcVecVT.getSizeInBits(); | |||
| 55299 | unsigned NumElts = VecWidth / DestWidth; | |||
| 55300 | EVT BitcastVT = EVT::getVectorVT(*DAG.getContext(), TruncVT, NumElts); | |||
| 55301 | SDValue BitcastVec = DAG.getBitcast(BitcastVT, ExtElt.getOperand(0)); | |||
| 55302 | SDLoc DL(N); | |||
| 55303 | SDValue NewExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TruncVT, | |||
| 55304 | BitcastVec, ExtElt.getOperand(1)); | |||
| 55305 | return DAG.getNode(N->getOpcode(), DL, N->getValueType(0), NewExtElt); | |||
| 55306 | } | |||
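| | // Minimal sketch: for | |||
| | //   sitofp (trunc i32 (extractelement <2 x i64> %v, 0)) | |||
| | // the source is bitcast to v4i32 and the node becomes | |||
| | //   sitofp (extractelement <4 x i32> (bitcast %v), 0) | |||
| | // keeping the value in an XMM register instead of a round trip via a GPR. | |||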
| 55307 | ||||
| 55308 | static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, | |||
| 55309 | const X86Subtarget &Subtarget) { | |||
| 55310 | bool IsStrict = N->isStrictFPOpcode(); | |||
| 55311 | SDValue Op0 = N->getOperand(IsStrict ? 1 : 0); | |||
| 55312 | EVT VT = N->getValueType(0); | |||
| 55313 | EVT InVT = Op0.getValueType(); | |||
| 55314 | ||||
| 55315 | // UINT_TO_FP(vXi1~15) -> UINT_TO_FP(ZEXT(vXi1~15 to vXi16)) | |||
| 55316 | // UINT_TO_FP(vXi17~31) -> UINT_TO_FP(ZEXT(vXi17~31 to vXi32)) | |||
| 55317 | // UINT_TO_FP(vXi33~63) -> UINT_TO_FP(ZEXT(vXi33~63 to vXi64)) | |||
| 55318 | if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) { | |||
| 55319 | unsigned ScalarSize = InVT.getScalarSizeInBits(); | |||
| 55320 | if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64) | |||
| 55321 | return SDValue(); | |||
| 55322 | SDLoc dl(N); | |||
| 55323 | EVT DstVT = EVT::getVectorVT(*DAG.getContext(), | |||
| 55324 | ScalarSize < 16 ? MVT::i16 | |||
| 55325 | : ScalarSize < 32 ? MVT::i32 | |||
| 55326 | : MVT::i64, | |||
| 55327 | InVT.getVectorNumElements()); | |||
| 55328 | SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); | |||
| 55329 | if (IsStrict) | |||
| 55330 | return DAG.getNode(ISD::STRICT_UINT_TO_FP, dl, {VT, MVT::Other}, | |||
| 55331 | {N->getOperand(0), P}); | |||
| 55332 | return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P); | |||
| 55333 | } | |||
| 55334 | ||||
| 55335 | // UINT_TO_FP(vXi1) -> SINT_TO_FP(ZEXT(vXi1 to vXi32)) | |||
| 55336 | // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32)) | |||
| 55337 | // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32)) | |||
| 55338 | if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 && | |||
| 55339 | VT.getScalarType() != MVT::f16) { | |||
| 55340 | SDLoc dl(N); | |||
| 55341 | EVT DstVT = InVT.changeVectorElementType(MVT::i32); | |||
| 55342 | SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); | |||
| 55343 | ||||
| 55344 | // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP. | |||
| 55345 | if (IsStrict) | |||
| 55346 | return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other}, | |||
| 55347 | {N->getOperand(0), P}); | |||
| 55348 | return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); | |||
| 55349 | } | |||
| 55350 | ||||
| 55351 | // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't | |||
| 55352 | // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform | |||
| 55353 | // the optimization here. | |||
| 55354 | if (DAG.SignBitIsZero(Op0)) { | |||
| 55355 | if (IsStrict) | |||
| 55356 | return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other}, | |||
| 55357 | {N->getOperand(0), Op0}); | |||
| 55358 | return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0); | |||
| 55359 | } | |||
| 55360 | ||||
| 55361 | return SDValue(); | |||
| 55362 | } | |||
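| | // Example of the SignBitIsZero path (illustrative): a uitofp whose input is | |||
| | // known non-negative, e.g. | |||
| | //   %z = and i32 %x, 32767 | |||
| | //   %f = uitofp i32 %z to float | |||
| | // is safely rewritten to sitofp, which lowers to the plain CVTSI2SS. | |||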
| 55363 | ||||
| 55364 | static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG, | |||
| 55365 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 55366 | const X86Subtarget &Subtarget) { | |||
| 55367 | // First try to optimize away the conversion entirely when it's | |||
| 55368 | // conditionally from a constant. Vectors only. | |||
| 55369 | bool IsStrict = N->isStrictFPOpcode(); | |||
| 55370 | if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG)) | |||
| 55371 | return Res; | |||
| 55372 | ||||
| 55373 | // Now move on to more general possibilities. | |||
| 55374 | SDValue Op0 = N->getOperand(IsStrict ? 1 : 0); | |||
| 55375 | EVT VT = N->getValueType(0); | |||
| 55376 | EVT InVT = Op0.getValueType(); | |||
| 55377 | ||||
| 55378 | // SINT_TO_FP(vXi1~15) -> SINT_TO_FP(SEXT(vXi1~15 to vXi16)) | |||
| 55379 | // SINT_TO_FP(vXi17~31) -> SINT_TO_FP(SEXT(vXi17~31 to vXi32)) | |||
| 55380 | // SINT_TO_FP(vXi33~63) -> SINT_TO_FP(SEXT(vXi33~63 to vXi64)) | |||
| 55381 | if (InVT.isVector() && VT.getVectorElementType() == MVT::f16) { | |||
| 55382 | unsigned ScalarSize = InVT.getScalarSizeInBits(); | |||
| 55383 | if (ScalarSize == 16 || ScalarSize == 32 || ScalarSize >= 64) | |||
| 55384 | return SDValue(); | |||
| 55385 | SDLoc dl(N); | |||
| 55386 | EVT DstVT = EVT::getVectorVT(*DAG.getContext(), | |||
| 55387 | ScalarSize < 16 ? MVT::i16 | |||
| 55388 | : ScalarSize < 32 ? MVT::i32 | |||
| 55389 | : MVT::i64, | |||
| 55390 | InVT.getVectorNumElements()); | |||
| 55391 | SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); | |||
| 55392 | if (IsStrict) | |||
| 55393 | return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other}, | |||
| 55394 | {N->getOperand(0), P}); | |||
| 55395 | return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); | |||
| 55396 | } | |||
| 55397 | ||||
| 55398 | // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32)) | |||
| 55399 | // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32)) | |||
| 55400 | // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32)) | |||
| 55401 | if (InVT.isVector() && InVT.getScalarSizeInBits() < 32 && | |||
| 55402 | VT.getScalarType() != MVT::f16) { | |||
| 55403 | SDLoc dl(N); | |||
| 55404 | EVT DstVT = InVT.changeVectorElementType(MVT::i32); | |||
| 55405 | SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0); | |||
| 55406 | if (IsStrict) | |||
| 55407 | return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other}, | |||
| 55408 | {N->getOperand(0), P}); | |||
| 55409 | return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); | |||
| 55410 | } | |||
| 55411 | ||||
| 55412 | // Without AVX512DQ we only support i64 to float scalar conversion. For both | |||
| 55413 | // vectors and scalars, see if we know that the upper bits are all the sign | |||
| 55414 | // bit, in which case we can truncate the input to i32 and convert from that. | |||
| 55415 | if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) { | |||
| 55416 | unsigned BitWidth = InVT.getScalarSizeInBits(); | |||
| 55417 | unsigned NumSignBits = DAG.ComputeNumSignBits(Op0); | |||
| 55418 | if (NumSignBits >= (BitWidth - 31)) { | |||
| 55419 | EVT TruncVT = MVT::i32; | |||
| 55420 | if (InVT.isVector()) | |||
| 55421 | TruncVT = InVT.changeVectorElementType(TruncVT); | |||
| 55422 | SDLoc dl(N); | |||
| 55423 | if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) { | |||
| 55424 | SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0); | |||
| 55425 | if (IsStrict) | |||
| 55426 | return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other}, | |||
| 55427 | {N->getOperand(0), Trunc}); | |||
| 55428 | return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc); | |||
| 55429 | } | |||
| 55430 | // If we're after legalize and the type is v2i32 we need to shuffle and | |||
| 55431 | // use CVTSI2P. | |||
| 55432 | assert(InVT == MVT::v2i64 && "Unexpected VT!"); | |||
| 55433 | SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0); | |||
| 55434 | SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast, | |||
| 55435 | { 0, 2, -1, -1 }); | |||
| 55436 | if (IsStrict) | |||
| 55437 | return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other}, | |||
| 55438 | {N->getOperand(0), Shuf}); | |||
| 55439 | return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf); | |||
| 55440 | } | |||
| 55441 | } | |||
| 55442 | ||||
| 55443 | // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have | |||
| 55444 | // a 32-bit target where SSE doesn't support i64->FP operations. | |||
| 55445 | if (!Subtarget.useSoftFloat() && Subtarget.hasX87() && | |||
| 55446 | Op0.getOpcode() == ISD::LOAD) { | |||
| 55447 | LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode()); | |||
| 55448 | ||||
| 55449 | // This transformation is not supported if the result type is f16 or f128. | |||
| 55450 | if (VT == MVT::f16 || VT == MVT::f128) | |||
| 55451 | return SDValue(); | |||
| 55452 | ||||
| 55453 | // If we have AVX512DQ we can use packed conversion instructions unless | |||
| 55454 | // the VT is f80. | |||
| 55455 | if (Subtarget.hasDQI() && VT != MVT::f80) | |||
| 55456 | return SDValue(); | |||
| 55457 | ||||
| 55458 | if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) && | |||
| 55459 | Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) { | |||
| 55460 | std::pair<SDValue, SDValue> Tmp = | |||
| 55461 | Subtarget.getTargetLowering()->BuildFILD( | |||
| 55462 | VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(), | |||
| 55463 | Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG); | |||
| 55464 | DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second); | |||
| 55465 | return Tmp.first; | |||
| 55466 | } | |||
| 55467 | } | |||
| 55468 | ||||
| 55469 | if (IsStrict) | |||
| 55470 | return SDValue(); | |||
| 55471 | ||||
| 55472 | if (SDValue V = combineToFPTruncExtElt(N, DAG)) | |||
| 55473 | return V; | |||
| 55474 | ||||
| 55475 | return SDValue(); | |||
| 55476 | } | |||
| 55477 | ||||
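| | // Return true if any user of the EFLAGS result reads a condition code that | |||
| | // depends on the carry, overflow or signed-order flags, or if a user cannot | |||
| | // be analyzed and we must be conservative. | |||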
| 55478 | static bool needCarryOrOverflowFlag(SDValue Flags) { | |||
| 55479 | assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!"); | |||
| 55480 | ||||
| 55481 | for (const SDNode *User : Flags->uses()) { | |||
| 55482 | X86::CondCode CC; | |||
| 55483 | switch (User->getOpcode()) { | |||
| 55484 | default: | |||
| 55485 | // Be conservative. | |||
| 55486 | return true; | |||
| 55487 | case X86ISD::SETCC: | |||
| 55488 | case X86ISD::SETCC_CARRY: | |||
| 55489 | CC = (X86::CondCode)User->getConstantOperandVal(0); | |||
| 55490 | break; | |||
| 55491 | case X86ISD::BRCOND: | |||
| 55492 | case X86ISD::CMOV: | |||
| 55493 | CC = (X86::CondCode)User->getConstantOperandVal(2); | |||
| 55494 | break; | |||
| 55495 | } | |||
| 55496 | ||||
| 55497 | switch (CC) { | |||
| 55498 | default: break; | |||
| 55499 | case X86::COND_A: case X86::COND_AE: | |||
| 55500 | case X86::COND_B: case X86::COND_BE: | |||
| 55501 | case X86::COND_O: case X86::COND_NO: | |||
| 55502 | case X86::COND_G: case X86::COND_GE: | |||
| 55503 | case X86::COND_L: case X86::COND_LE: | |||
| 55504 | return true; | |||
| 55505 | } | |||
| 55506 | } | |||
| 55507 | ||||
| 55508 | return false; | |||
| 55509 | } | |||
| 55510 | ||||
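| | // Return true if every user of the EFLAGS result only tests for equality | |||
| | // (COND_E/COND_NE), i.e. only the ZF flag is consumed. | |||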
| 55511 | static bool onlyZeroFlagUsed(SDValue Flags) { | |||
| 55512 | assert(Flags.getValueType() == MVT::i32 && "Unexpected VT!"); | |||
| 55513 | ||||
| 55514 | for (const SDNode *User : Flags->uses()) { | |||
| 55515 | unsigned CCOpNo; | |||
| 55516 | switch (User->getOpcode()) { | |||
| 55517 | default: | |||
| 55518 | // Be conservative. | |||
| 55519 | return false; | |||
| 55520 | case X86ISD::SETCC: | |||
| 55521 | case X86ISD::SETCC_CARRY: | |||
| 55522 | CCOpNo = 0; | |||
| 55523 | break; | |||
| 55524 | case X86ISD::BRCOND: | |||
| 55525 | case X86ISD::CMOV: | |||
| 55526 | CCOpNo = 2; | |||
| 55527 | break; | |||
| 55528 | } | |||
| 55529 | ||||
| 55530 | X86::CondCode CC = (X86::CondCode)User->getConstantOperandVal(CCOpNo); | |||
| 55531 | if (CC != X86::COND_E && CC != X86::COND_NE) | |||
| 55532 | return false; | |||
| 55533 | } | |||
| 55534 | ||||
| 55535 | return true; | |||
| 55536 | } | |||
| 55537 | ||||
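| | // Combine X86ISD::CMP against zero: narrow truncated binops and rewrite | |||
| | // shifts/extends so the flags can come from a smaller op or a plain TEST. | |||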
| 55538 | static SDValue combineCMP(SDNode *N, SelectionDAG &DAG) { | |||
| 55539 | // Only handle test patterns. | |||
| 55540 | if (!isNullConstant(N->getOperand(1))) | |||
| 55541 | return SDValue(); | |||
| 55542 | ||||
| 55543 | // If we have a CMP of a truncated binop, see if we can make a smaller binop | |||
| 55544 | // and use its flags directly. | |||
| 55545 | // TODO: Maybe we should try promoting compares that only use the zero flag | |||
| 55546 | // first if we can prove the upper bits with computeKnownBits? | |||
| 55547 | SDLoc dl(N); | |||
| 55548 | SDValue Op = N->getOperand(0); | |||
| 55549 | EVT VT = Op.getValueType(); | |||
| 55550 | ||||
| 55551 | // If we have a constant logical shift that's only used in a comparison | |||
| 55552 | // against zero, turn it into an equivalent AND. This allows turning it | |||
| 55553 | // into a TEST instruction later. | |||
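| | // e.g. for i32: (cmp (srl X, 8), 0) -> (cmp (and X, 0xFFFFFF00), 0) | |||
| | //          and: (cmp (shl X, 8), 0) -> (cmp (and X, 0x00FFFFFF), 0) | |||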
| 55554 | if ((Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) && | |||
| 55555 | Op.hasOneUse() && isa<ConstantSDNode>(Op.getOperand(1)) && | |||
| 55556 | onlyZeroFlagUsed(SDValue(N, 0))) { | |||
| 55557 | unsigned BitWidth = VT.getSizeInBits(); | |||
| 55558 | const APInt &ShAmt = Op.getConstantOperandAPInt(1); | |||
| 55559 | if (ShAmt.ult(BitWidth)) { // Avoid undefined shifts. | |||
| 55560 | unsigned MaskBits = BitWidth - ShAmt.getZExtValue(); | |||
| 55561 | APInt Mask = Op.getOpcode() == ISD::SRL | |||
| 55562 | ? APInt::getHighBitsSet(BitWidth, MaskBits) | |||
| 55563 | : APInt::getLowBitsSet(BitWidth, MaskBits); | |||
| 55564 | if (Mask.isSignedIntN(32)) { | |||
| 55565 | Op = DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), | |||
| 55566 | DAG.getConstant(Mask, dl, VT)); | |||
| 55567 | return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, | |||
| 55568 | DAG.getConstant(0, dl, VT)); | |||
| 55569 | } | |||
| 55570 | } | |||
| 55571 | } | |||
| 55572 | ||||
| 55573 | // Peek through any zero-extend if we're only testing for a zero result. | |||
| 55574 | if (Op.getOpcode() == ISD::ZERO_EXTEND && onlyZeroFlagUsed(SDValue(N, 0))) { | |||
| 55575 | SDValue Src = Op.getOperand(0); | |||
| 55576 | EVT SrcVT = Src.getValueType(); | |||
| 55577 | if (SrcVT.getScalarSizeInBits() >= 8 && | |||
| 55578 | DAG.getTargetLoweringInfo().isTypeLegal(SrcVT)) | |||
| 55579 | return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Src, | |||
| 55580 | DAG.getConstant(0, dl, SrcVT)); | |||
| 55581 | } | |||
| 55582 | ||||
| 55583 | // Look for a truncate. | |||
| 55584 | if (Op.getOpcode() != ISD::TRUNCATE) | |||
| 55585 | return SDValue(); | |||
| 55586 | ||||
| 55587 | SDValue Trunc = Op; | |||
| 55588 | Op = Op.getOperand(0); | |||
| 55589 | ||||
| 55590 | // See if we can compare with zero against the truncation source, which | |||
| 55591 | // should help use the Z flag from many ops. Only do this for an i32 | |||
| 55592 | // truncated op to prevent partial-register compares of promoted ops. | |||
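| | // e.g. (cmp (i8 (trunc X)), 0) -> (cmp X, 0) when the upper 24 bits of the | |||
| | // i32 X are known to be zero. | |||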
| 55593 | EVT OpVT = Op.getValueType(); | |||
| 55594 | APInt UpperBits = | |||
| 55595 | APInt::getBitsSetFrom(OpVT.getSizeInBits(), VT.getSizeInBits()); | |||
| 55596 | if (OpVT == MVT::i32 && DAG.MaskedValueIsZero(Op, UpperBits) && | |||
| 55597 | onlyZeroFlagUsed(SDValue(N, 0))) { | |||
| 55598 | return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, | |||
| 55599 | DAG.getConstant(0, dl, OpVT)); | |||
| 55600 | } | |||
| 55601 | ||||
| 55602 | // After this the truncate and arithmetic op must have a single use. | |||
| 55603 | if (!Trunc.hasOneUse() || !Op.hasOneUse()) | |||
| 55604 | return SDValue(); | |||
| 55605 | ||||
| 55606 | unsigned NewOpc; | |||
| 55607 | switch (Op.getOpcode()) { | |||
| 55608 | default: return SDValue(); | |||
| 55609 | case ISD::AND: | |||
| 55610 | // Skip AND with a constant. We have special handling for AND with an | |||
| 55611 | // immediate during isel to generate TEST instructions. | |||
| 55612 | if (isa<ConstantSDNode>(Op.getOperand(1))) | |||
| 55613 | return SDValue(); | |||
| 55614 | NewOpc = X86ISD::AND; | |||
| 55615 | break; | |||
| 55616 | case ISD::OR: NewOpc = X86ISD::OR; break; | |||
| 55617 | case ISD::XOR: NewOpc = X86ISD::XOR; break; | |||
| 55618 | case ISD::ADD: | |||
| 55619 | // If the carry or overflow flag is used, we can't truncate. | |||
| 55620 | if (needCarryOrOverflowFlag(SDValue(N, 0))) | |||
| 55621 | return SDValue(); | |||
| 55622 | NewOpc = X86ISD::ADD; | |||
| 55623 | break; | |||
| 55624 | case ISD::SUB: | |||
| 55625 | // If the carry or overflow flag is used, we can't truncate. | |||
| 55626 | if (needCarryOrOverflowFlag(SDValue(N, 0))) | |||
| 55627 | return SDValue(); | |||
| 55628 | NewOpc = X86ISD::SUB; | |||
| 55629 | break; | |||
| 55630 | } | |||
| 55631 | ||||
| 55632 | // We found an op we can narrow. Truncate its inputs. | |||
| 55633 | SDValue Op0 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(0)); | |||
| 55634 | SDValue Op1 = DAG.getNode(ISD::TRUNCATE, dl, VT, Op.getOperand(1)); | |||
| 55635 | ||||
| 55636 | // Use an X86-specific opcode to avoid DAG combine messing with it. | |||
| 55637 | SDVTList VTs = DAG.getVTList(VT, MVT::i32); | |||
| 55638 | Op = DAG.getNode(NewOpc, dl, VTs, Op0, Op1); | |||
| 55639 | ||||
| 55640 | // For AND, keep a CMP so that we can match the test pattern. | |||
| 55641 | if (NewOpc == X86ISD::AND) | |||
| 55642 | return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, | |||
| 55643 | DAG.getConstant(0, dl, VT)); | |||
| 55644 | ||||
| 55645 | // Return the flags. | |||
| 55646 | return Op.getValue(1); | |||
| 55647 | } | |||
| 55648 | ||||
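| | // Optimize RES, EFLAGS = X86ISD::ADD/SUB LHS, RHS | |||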
| 55649 | static SDValue combineX86AddSub(SDNode *N, SelectionDAG &DAG, | |||
| 55650 | TargetLowering::DAGCombinerInfo &DCI) { | |||
| 55651 | assert((X86ISD::ADD == N->getOpcode() || X86ISD::SUB == N->getOpcode()) && | |||
| 55652 | "Expected X86ISD::ADD or X86ISD::SUB"); | |||
| 55653 | ||||
| 55654 | SDLoc DL(N); | |||
| 55655 | SDValue LHS = N->getOperand(0); | |||
| 55656 | SDValue RHS = N->getOperand(1); | |||
| 55657 | MVT VT = LHS.getSimpleValueType(); | |||
| 55658 | bool IsSub = X86ISD::SUB == N->getOpcode(); | |||
| 55659 | unsigned GenericOpc = IsSub ? ISD::SUB : ISD::ADD; | |||
| 55660 | ||||
| 55661 | // If we don't use the flag result, simplify back to a generic ADD/SUB. | |||
| 55662 | if (!N->hasAnyUseOfValue(1)) { | |||
| 55663 | SDValue Res = DAG.getNode(GenericOpc, DL, VT, LHS, RHS); | |||
| 55664 | return DAG.getMergeValues({Res, DAG.getConstant(0, DL, MVT::i32)}, DL); | |||
| 55665 | } | |||
| 55666 | ||||
| 55667 | // Fold any similar generic ADD/SUB opcodes to reuse this node. | |||
| 55668 | auto MatchGeneric = [&](SDValue N0, SDValue N1, bool Negate) { | |||
| 55669 | SDValue Ops[] = {N0, N1}; | |||
| 55670 | SDVTList VTs = DAG.getVTList(N->getValueType(0)); | |||
| 55671 | if (SDNode *GenericAddSub = DAG.getNodeIfExists(GenericOpc, VTs, Ops)) { | |||
| 55672 | SDValue Op(N, 0); | |||
| 55673 | if (Negate) | |||
| 55674 | Op = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Op); | |||
| 55675 | DCI.CombineTo(GenericAddSub, Op); | |||
| 55676 | } | |||
| 55677 | }; | |||
| 55678 | MatchGeneric(LHS, RHS, false); | |||
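| | // For SUB, a generic node with swapped operands computes the negation of | |||
| | // this node's result, so it must be negated before reuse. | |||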
| 55679 | MatchGeneric(RHS, LHS, X86ISD::SUB == N->getOpcode()); | |||
| 55680 | ||||
| 55681 | // TODO: Can we drop the ZeroSecondOpOnly limit? This is to guarantee that the | |||
| 55682 | // EFLAGS result doesn't change. | |||
| 55683 | return combineAddOrSubToADCOrSBB(IsSub, DL, VT, LHS, RHS, DAG, | |||
| 55684 | /*ZeroSecondOpOnly*/ true); | |||
| 55685 | } | |||
| 55686 | ||||
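| | // Optimize RES, EFLAGS = X86ISD::SBB LHS, RHS, EFLAGS | |||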
| 55687 | static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { | |||
| 55688 | SDValue LHS = N->getOperand(0); | |||
| 55689 | SDValue RHS = N->getOperand(1); | |||
| 55690 | SDValue BorrowIn = N->getOperand(2); | |||
| 55691 | ||||
| 55692 | if (SDValue Flags = combineCarryThroughADD(BorrowIn, DAG)) { | |||
| 55693 | MVT VT = N->getSimpleValueType(0); | |||
| 55694 | SDVTList VTs = DAG.getVTList(VT, MVT::i32); | |||
| 55695 | return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs, LHS, RHS, Flags); | |||
| 55696 | } | |||
| 55697 | ||||
| 55698 | // Fold SBB(SUB(X,Y),0,Carry) -> SBB(X,Y,Carry) | |||
| 55699 | // iff the flag result is dead. | |||
| 55700 | if (LHS.getOpcode() == ISD::SUB && isNullConstant(RHS) && | |||
| 55701 | !N->hasAnyUseOfValue(1)) | |||
| 55702 | return DAG.getNode(X86ISD::SBB, SDLoc(N), N->getVTList(), LHS.getOperand(0), | |||
| 55703 | LHS.getOperand(1), BorrowIn); | |||
| 55704 | ||||
| 55705 | return SDValue(); | |||
| 55706 | } | |||
| 55707 | ||||
| 55708 | // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS | |||
| 55709 | static SDValue combineADC(SDNode *N, SelectionDAG &DAG, | |||
| 55710 | TargetLowering::DAGCombinerInfo &DCI) { | |||
| 55711 | SDValue LHS = N->getOperand(0); | |||
| 55712 | SDValue RHS = N->getOperand(1); | |||
| 55713 | SDValue CarryIn = N->getOperand(2); | |||
| 55714 | auto *LHSC = dyn_cast<ConstantSDNode>(LHS); | |||
| 55715 | auto *RHSC = dyn_cast<ConstantSDNode>(RHS); | |||
| 55716 | ||||
| 55717 | // Canonicalize constant to RHS. | |||
| 55718 | if (LHSC && !RHSC) | |||
| 55719 | return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), RHS, LHS, | |||
| 55720 | CarryIn); | |||
| 55721 | ||||
| 55722 | // If the LHS and RHS of the ADC node are zero, then it can't overflow and | |||
| 55723 | // the result is either zero or one (depending on the input carry bit). | |||
| 55724 | // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. | |||
| 55725 | if (LHSC && RHSC && LHSC->isZero() && RHSC->isZero() && | |||
| 55726 | // We don't have a good way to replace an EFLAGS use, so only do this when | |||
| 55727 | // dead right now. | |||
| 55728 | SDValue(N, 1).use_empty()) { | |||
| 55729 | SDLoc DL(N); | |||
| 55730 | EVT VT = N->getValueType(0); | |||
| 55731 | SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1)); | |||
| 55732 | SDValue Res1 = DAG.getNode( | |||
| 55733 | ISD::AND, DL, VT, | |||
| 55734 | DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, | |||
| 55735 | DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), CarryIn), | |||
| 55736 | DAG.getConstant(1, DL, VT)); | |||
| 55737 | return DCI.CombineTo(N, Res1, CarryOut); | |||
| 55738 | } | |||
| 55739 | ||||
| 55740 | // Fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry) | |||
| 55741 | // iff the flag result is dead. | |||
| 55742 | // TODO: Allow flag result if C1+C2 doesn't signed/unsigned overflow. | |||
| 55743 | if (LHSC && RHSC && !LHSC->isZero() && !N->hasAnyUseOfValue(1)) { | |||
| 55744 | SDLoc DL(N); | |||
| 55745 | APInt Sum = LHSC->getAPIntValue() + RHSC->getAPIntValue(); | |||
| 55746 | return DAG.getNode(X86ISD::ADC, DL, N->getVTList(), | |||
| 55747 | DAG.getConstant(0, DL, LHS.getValueType()), | |||
| 55748 | DAG.getConstant(Sum, DL, LHS.getValueType()), CarryIn); | |||
| 55749 | } | |||
| 55750 | ||||
| 55751 | if (SDValue Flags = combineCarryThroughADD(CarryIn, DAG)) { | |||
| 55752 | MVT VT = N->getSimpleValueType(0); | |||
| 55753 | SDVTList VTs = DAG.getVTList(VT, MVT::i32); | |||
| 55754 | return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs, LHS, RHS, Flags); | |||
| 55755 | } | |||
| 55756 | ||||
| 55757 | // Fold ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry) | |||
| 55758 | // iff the flag result is dead. | |||
| 55759 | if (LHS.getOpcode() == ISD::ADD && RHSC && RHSC->isZero() && | |||
| 55760 | !N->hasAnyUseOfValue(1)) | |||
| 55761 | return DAG.getNode(X86ISD::ADC, SDLoc(N), N->getVTList(), LHS.getOperand(0), | |||
| 55762 | LHS.getOperand(1), CarryIn); | |||
| 55763 | ||||
| 55764 | return SDValue(); | |||
| 55765 | } | |||
| 55766 | ||||
| 55767 | static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1, | |||
| 55768 | const SDLoc &DL, EVT VT, | |||
| 55769 | const X86Subtarget &Subtarget) { | |||
| 55770 | // Example of the pattern we try to detect: | |||
| 55771 | // t := (v8i32 mul (sext (v8i16 x0)), (sext (v8i16 x1))) | |||
| 55772 | // (add (build_vector (extract_elt t, 0), | |||
| 55773 | //                    (extract_elt t, 2), | |||
| 55774 | //                    (extract_elt t, 4), | |||
| 55775 | //                    (extract_elt t, 6)), | |||
| 55776 | //      (build_vector (extract_elt t, 1), | |||
| 55777 | //                    (extract_elt t, 3), | |||
| 55778 | //                    (extract_elt t, 5), | |||
| 55779 | //                    (extract_elt t, 7))) | |||
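| | // This matches the semantics of PMADDWD: each i32 result element is | |||
| | // x0[2*i] * x1[2*i] + x0[2*i+1] * x1[2*i+1] on the i16 inputs. | |||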
| 55780 | ||||
| 55781 | if (!Subtarget.hasSSE2()) | |||
| 55782 | return SDValue(); | |||
| 55783 | ||||
| 55784 | if (Op0.getOpcode() != ISD::BUILD_VECTOR || | |||
| 55785 | Op1.getOpcode() != ISD::BUILD_VECTOR) | |||
| 55786 | return SDValue(); | |||
| 55787 | ||||
| 55788 | if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 || | |||
| 55789 | VT.getVectorNumElements() < 4 || | |||
| 55790 | !isPowerOf2_32(VT.getVectorNumElements())) | |||
| 55791 | return SDValue(); | |||
| 55792 | ||||
| 55793 | // Check if one of Op0,Op1 is of the form: | |||
| 55794 | // (build_vector (extract_elt Mul, 0), | |||
| 55795 | // (extract_elt Mul, 2), | |||
| 55796 | // (extract_elt Mul, 4), | |||
| 55797 | // ... | |||
| 55798 | // the other is of the form: | |||
| 55799 | // (build_vector (extract_elt Mul, 1), | |||
| 55800 | // (extract_elt Mul, 3), | |||
| 55801 | // (extract_elt Mul, 5), | |||
| 55802 | // ... | |||
| 55803 | // and identify Mul. | |||
| 55804 | SDValue Mul; | |||
| 55805 | for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; i += 2) { | |||
| 55806 | SDValue Op0L = Op0->getOperand(i), Op1L = Op1->getOperand(i), | |||
| 55807 | Op0H = Op0->getOperand(i + 1), Op1H = Op1->getOperand(i + 1); | |||
| 55808 | // TODO: Be more tolerant to undefs. | |||
| 55809 | if (Op0L.getOpcode() != ISD::EXTRACT_VECTOR_ELT || | |||
| 55810 | Op1L.getOpcode() != ISD::EXTRACT_VECTOR_ELT || | |||
| 55811 | Op0H.getOpcode() != ISD::EXTRACT_VECTOR_ELT || | |||
| 55812 | Op1H.getOpcode() != ISD::EXTRACT_VECTOR_ELT) | |||
| 55813 | return SDValue(); | |||
| 55814 | auto *Const0L = dyn_cast<ConstantSDNode>(Op0L->getOperand(1)); | |||
| 55815 | auto *Const1L = dyn_cast<ConstantSDNode>(Op1L->getOperand(1)); | |||
| 55816 | auto *Const0H = dyn_cast<ConstantSDNode>(Op0H->getOperand(1)); | |||
| 55817 | auto *Const1H = dyn_cast<ConstantSDNode>(Op1H->getOperand(1)); | |||
| 55818 | if (!Const0L || !Const1L || !Const0H || !Const1H) | |||
| 55819 | return SDValue(); | |||
| 55820 | unsigned Idx0L = Const0L->getZExtValue(), Idx1L = Const1L->getZExtValue(), | |||
| 55821 | Idx0H = Const0H->getZExtValue(), Idx1H = Const1H->getZExtValue(); | |||
| 55822 | // Commutativity of mul allows factors of a product to reorder. | |||
| 55823 | if (Idx0L > Idx1L) | |||
| 55824 | std::swap(Idx0L, Idx1L); | |||
| 55825 | if (Idx0H > Idx1H) | |||
| 55826 | std::swap(Idx0H, Idx1H); | |||
| 55827 | // Commutativity of add allows pairs of factors to reorder. | |||
| 55828 | if (Idx0L > Idx0H) { | |||
| 55829 | std::swap(Idx0L, Idx0H); | |||
| 55830 | std::swap(Idx1L, Idx1H); | |||
| 55831 | } | |||
| 55832 | if (Idx0L != 2 * i || Idx1L != 2 * i + 1 || Idx0H != 2 * i + 2 || | |||
| 55833 | Idx1H != 2 * i + 3) | |||
| 55834 | return SDValue(); | |||
| 55835 | if (!Mul) { | |||
| 55836 | // First time an extract_elt's source vector is visited. Must be a MUL | |||
| 55837 | // with twice the number of vector elements of the BUILD_VECTOR. | |||
| 55838 | // Both extracts must be from the same MUL. | |||
| 55839 | Mul = Op0L->getOperand(0); | |||
| 55840 | if (Mul->getOpcode() != ISD::MUL || | |||
| 55841 | Mul.getValueType().getVectorNumElements() != 2 * e) | |||
| 55842 | return SDValue(); | |||
| 55843 | } | |||
| 55844 | // Check that the extract is from the same MUL previously seen. | |||
| 55845 | if (Mul != Op0L->getOperand(0) || Mul != Op1L->getOperand(0) || | |||
| 55846 | Mul != Op0H->getOperand(0) || Mul != Op1H->getOperand(0)) | |||
| 55847 | return SDValue(); | |||
| 55848 | } | |||
| 55849 | ||||
| 55850 | // Check if the Mul source can be safely shrunk. | |||
| 55851 | ShrinkMode Mode; | |||
| 55852 | if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || | |||
| 55853 | Mode == ShrinkMode::MULU16) | |||
| 55854 | return SDValue(); | |||
| 55855 | ||||
| 55856 | EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, | |||
| 55857 | VT.getVectorNumElements() * 2); | |||
| 55858 | SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0)); | |||
| 55859 | SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1)); | |||
| 55860 | ||||
| 55861 | auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, | |||
| 55862 | ArrayRef<SDValue> Ops) { | |||
| 55863 | EVT InVT = Ops[0].getValueType(); | |||
| 55864 | assert(InVT == Ops[1].getValueType() && "Operands' types mismatch"); | |||
| 55865 | EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, | |||
| 55866 | InVT.getVectorNumElements() / 2); | |||
| 55867 | return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]); | |||
| 55868 | }; | |||
| 55869 | return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder); | |||
| 55870 | } | |||
| 55871 | ||||
| 55872 | // Attempt to turn this pattern into PMADDWD. | |||
| 55873 | // (add (mul (sext (build_vector)), (sext (build_vector))), | |||
| 55874 | //      (mul (sext (build_vector)), (sext (build_vector)))) | |||
| 55875 | static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1, | |||
| 55876 | const SDLoc &DL, EVT VT, | |||
| 55877 | const X86Subtarget &Subtarget) { | |||
| 55878 | if (!Subtarget.hasSSE2()) | |||
| 55879 | return SDValue(); | |||
| 55880 | ||||
| 55881 | if (N0.getOpcode() != ISD::MUL || N1.getOpcode() != ISD::MUL) | |||
| 55882 | return SDValue(); | |||
| 55883 | ||||
| 55884 | if (!VT.isVector() || VT.getVectorElementType() != MVT::i32 || | |||
| 55885 | VT.getVectorNumElements() < 4 || | |||
| 55886 | !isPowerOf2_32(VT.getVectorNumElements())) | |||
| 55887 | return SDValue(); | |||
| 55888 | ||||
| 55889 | SDValue N00 = N0.getOperand(0); | |||
| 55890 | SDValue N01 = N0.getOperand(1); | |||
| 55891 | SDValue N10 = N1.getOperand(0); | |||
| 55892 | SDValue N11 = N1.getOperand(1); | |||
| 55893 | ||||
| 55894 | // All inputs need to be sign extends. | |||
| 55895 | // TODO: Support ZERO_EXTEND from known positive? | |||
| 55896 | if (N00.getOpcode() != ISD::SIGN_EXTEND || | |||
| 55897 | N01.getOpcode() != ISD::SIGN_EXTEND || | |||
| 55898 | N10.getOpcode() != ISD::SIGN_EXTEND || | |||
| 55899 | N11.getOpcode() != ISD::SIGN_EXTEND) | |||
| 55900 | return SDValue(); | |||
| 55901 | ||||
| 55902 | // Peek through the extends. | |||
| 55903 | N00 = N00.getOperand(0); | |||
| 55904 | N01 = N01.getOperand(0); | |||
| 55905 | N10 = N10.getOperand(0); | |||
| 55906 | N11 = N11.getOperand(0); | |||
| 55907 | ||||
| 55908 | // Must be extending from vXi16. | |||
| 55909 | EVT InVT = N00.getValueType(); | |||
| 55910 | if (InVT.getVectorElementType() != MVT::i16 || N01.getValueType() != InVT || | |||
| 55911 | N10.getValueType() != InVT || N11.getValueType() != InVT) | |||
| 55912 | return SDValue(); | |||
| 55913 | ||||
| 55914 | // All inputs should be build_vectors. | |||
| 55915 | if (N00.getOpcode() != ISD::BUILD_VECTOR || | |||
| 55916 | N01.getOpcode() != ISD::BUILD_VECTOR || | |||
| 55917 | N10.getOpcode() != ISD::BUILD_VECTOR || | |||
| 55918 | N11.getOpcode() != ISD::BUILD_VECTOR) | |||
| 55919 | return SDValue(); | |||
| 55920 | ||||
| 55921 | // For each element, we need the even element from one vector multiplied | |||
| 55922 | // by the even element of the other vector, plus the odd element from the | |||
| 55923 | // first vector multiplied by the odd element of the other vector. So we | |||
| 55924 | // need to make sure that for each element i, this operation is being | |||
| 55925 | // performed: | |||
| 55926 | // A[2 * i] * B[2 * i] + A[2 * i + 1] * B[2 * i + 1] | |||
| 55927 | SDValue In0, In1; | |||
| 55928 | for (unsigned i = 0; i != N00.getNumOperands(); ++i) { | |||
| 55929 | SDValue N00Elt = N00.getOperand(i); | |||
| 55930 | SDValue N01Elt = N01.getOperand(i); | |||
| 55931 | SDValue N10Elt = N10.getOperand(i); | |||
| 55932 | SDValue N11Elt = N11.getOperand(i); | |||
| 55933 | // TODO: Be more tolerant to undefs. | |||
| 55934 | if (N00Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || | |||
| 55935 | N01Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || | |||
| 55936 | N10Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT || | |||
| 55937 | N11Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT) | |||
| 55938 | return SDValue(); | |||
| 55939 | auto *ConstN00Elt = dyn_cast<ConstantSDNode>(N00Elt.getOperand(1)); | |||
| 55940 | auto *ConstN01Elt = dyn_cast<ConstantSDNode>(N01Elt.getOperand(1)); | |||
| 55941 | auto *ConstN10Elt = dyn_cast<ConstantSDNode>(N10Elt.getOperand(1)); | |||
| 55942 | auto *ConstN11Elt = dyn_cast<ConstantSDNode>(N11Elt.getOperand(1)); | |||
| 55943 | if (!ConstN00Elt || !ConstN01Elt || !ConstN10Elt || !ConstN11Elt) | |||
| 55944 | return SDValue(); | |||
| 55945 | unsigned IdxN00 = ConstN00Elt->getZExtValue(); | |||
| 55946 | unsigned IdxN01 = ConstN01Elt->getZExtValue(); | |||
| 55947 | unsigned IdxN10 = ConstN10Elt->getZExtValue(); | |||
| 55948 | unsigned IdxN11 = ConstN11Elt->getZExtValue(); | |||
| 55949 | // Add is commutative so indices can be reordered. | |||
| 55950 | if (IdxN00 > IdxN10) { | |||
| 55951 | std::swap(IdxN00, IdxN10); | |||
| 55952 | std::swap(IdxN01, IdxN11); | |||
| 55953 | } | |||
| 55954 | // N0 indices must be the even elements. N1 indices must be the next odd elements. | |||
| 55955 | if (IdxN00 != 2 * i || IdxN10 != 2 * i + 1 || | |||
| 55956 | IdxN01 != 2 * i || IdxN11 != 2 * i + 1) | |||
| 55957 | return SDValue(); | |||
| 55958 | SDValue N00In = N00Elt.getOperand(0); | |||
| 55959 | SDValue N01In = N01Elt.getOperand(0); | |||
| 55960 | SDValue N10In = N10Elt.getOperand(0); | |||
| 55961 | SDValue N11In = N11Elt.getOperand(0); | |||
| 55962 | ||||
| 55963 | // The first time we find an input, capture it. | |||
| 55964 | if (!In0) { | |||
| 55965 | In0 = N00In; | |||
| 55966 | In1 = N01In; | |||
| 55967 | ||||
| 55968 | // The input vectors must be at least as wide as the output. | |||
| 55969 | // If they are larger than the output, we extract a subvector below. | |||
| 55970 | if (In0.getValueSizeInBits() < VT.getSizeInBits() || | |||
| 55971 | In1.getValueSizeInBits() < VT.getSizeInBits()) | |||
| 55972 | return SDValue(); | |||
| 55973 | } | |||
| 55974 | // Mul is commutative so the input vectors can be in any order. | |||
| 55975 | // Canonicalize to make the compares easier. | |||
| 55976 | if (In0 != N00In) | |||
| 55977 | std::swap(N00In, N01In); | |||
| 55978 | if (In0 != N10In) | |||
| 55979 | std::swap(N10In, N11In); | |||
| 55980 | if (In0 != N00In || In1 != N01In || In0 != N10In || In1 != N11In) | |||
| 55981 | return SDValue(); | |||
| 55982 | } | |||
| 55983 | ||||
| 55984 | auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL, | |||
| 55985 | ArrayRef<SDValue> Ops) { | |||
| 55986 | EVT OpVT = Ops[0].getValueType(); | |||
| 55987 | assert(OpVT.getScalarType() == MVT::i16 && | |||
| 55988 | "Unexpected scalar element type"); | |||
| 55989 | assert(OpVT == Ops[1].getValueType() && "Operands' types mismatch"); | |||
| 55990 | EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, | |||
| 55991 | OpVT.getVectorNumElements() / 2); | |||
| 55992 | return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]); | |||
| 55993 | }; | |||
| 55994 | ||||
| 55995 | // If the output is narrower than an input, extract the low part of the input | |||
| 55996 | // vector. | |||
| 55997 | EVT OutVT16 = EVT::getVectorVT(*DAG.getContext(), MVT::i16, | |||
| 55998 | VT.getVectorNumElements() * 2); | |||
| 55999 | if (OutVT16.bitsLT(In0.getValueType())) { | |||
| 56000 | In0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In0, | |||
| 56001 | DAG.getIntPtrConstant(0, DL)); | |||
| 56002 | } | |||
| 56003 | if (OutVT16.bitsLT(In1.getValueType())) { | |||
| 56004 | In1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT16, In1, | |||
| 56005 | DAG.getIntPtrConstant(0, DL)); | |||
| 56006 | } | |||
| 56007 | return SplitOpsAndApply(DAG, Subtarget, DL, VT, { In0, In1 }, | |||
| 56008 | PMADDBuilder); | |||
| 56009 | } | |||
| 56010 | ||||
| 56011 | // ADD(VPMADDWD(X,Y),VPMADDWD(Z,W)) -> VPMADDWD(SHUFFLE(X,Z), SHUFFLE(Y,W)) | |||
| 56012 | // If the upper element in each pair of both VPMADDWD is zero then we can | |||
| 56013 | // merge the operand elements and use the implicit add of VPMADDWD. | |||
| 56014 | // TODO: Add support for VPMADDUBSW (which isn't commutable). | |||
| 56015 | static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1, | |||
| 56016 | const SDLoc &DL, EVT VT) { | |||
| 56017 | if (N0.getOpcode() != N1.getOpcode() || N0.getOpcode() != X86ISD::VPMADDWD) | |||
| 56018 | return SDValue(); | |||
| 56019 | ||||
| 56020 | // TODO: Add 256/512-bit support once VPMADDWD combines with shuffles. | |||
| 56021 | if (VT.getSizeInBits() > 128) | |||
| 56022 | return SDValue(); | |||
| 56023 | ||||
| 56024 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 56025 | MVT OpVT = N0.getOperand(0).getSimpleValueType(); | |||
| 56026 | APInt DemandedBits = APInt::getAllOnes(OpVT.getScalarSizeInBits()); | |||
| 56027 | APInt DemandedHiElts = APInt::getSplat(2 * NumElts, APInt(2, 2)); | |||
| 56028 | ||||
| 56029 | bool Op0HiZero = | |||
| 56030 | DAG.MaskedValueIsZero(N0.getOperand(0), DemandedBits, DemandedHiElts) || | |||
| 56031 | DAG.MaskedValueIsZero(N0.getOperand(1), DemandedBits, DemandedHiElts); | |||
| 56032 | bool Op1HiZero = | |||
| 56033 | DAG.MaskedValueIsZero(N1.getOperand(0), DemandedBits, DemandedHiElts) || | |||
| 56034 | DAG.MaskedValueIsZero(N1.getOperand(1), DemandedBits, DemandedHiElts); | |||
| 56035 | ||||
| 56036 | // TODO: Check for zero lower elements once we have actual codegen that | |||
| 56037 | // creates them. | |||
| 56038 | if (!Op0HiZero || !Op1HiZero) | |||
| 56039 | return SDValue(); | |||
| 56040 | ||||
| 56041 | // Create a shuffle mask packing the lower elements from each VPMADDWD. | |||
| 56042 | SmallVector<int> Mask; | |||
| 56043 | for (int i = 0; i != (int)NumElts; ++i) { | |||
| 56044 | Mask.push_back(2 * i); | |||
| 56045 | Mask.push_back(2 * (i + NumElts)); | |||
| 56046 | } | |||
| 56047 | ||||
| 56048 | SDValue LHS = | |||
| 56049 | DAG.getVectorShuffle(OpVT, DL, N0.getOperand(0), N1.getOperand(0), Mask); | |||
| 56050 | SDValue RHS = | |||
| 56051 | DAG.getVectorShuffle(OpVT, DL, N0.getOperand(1), N1.getOperand(1), Mask); | |||
| 56052 | return DAG.getNode(X86ISD::VPMADDWD, DL, VT, LHS, RHS); | |||
| 56053 | } | |||
| 56054 | ||||
| 56055 | /// CMOV of constants requires materializing constant operands in registers. | |||
| 56056 | /// Try to fold those constants into an 'add' instruction to reduce instruction | |||
| 56057 | // count. We do this with CMOV rather than the generic 'select' because | |||
| 56058 | // there are earlier folds that may turn select-of-constants into logic hacks. | |||
| 56059 | static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG, | |||
| 56060 | const X86Subtarget &Subtarget) { | |||
| 56061 | // If an operand is zero, add-of-0 gets simplified away, so that's clearly | |||
| 56062 | // better because we eliminate 1-2 instructions. This transform is still | |||
| 56063 | // an improvement without zero operands because we trade 2 move constants and | |||
| 56064 | // 1 add for 2 adds (LEA) as long as the constants can be represented as | |||
| 56065 | // immediate asm operands (fit in 32-bits). | |||
| 56066 | auto isSuitableCmov = [](SDValue V) { | |||
| 56067 | if (V.getOpcode() != X86ISD::CMOV || !V.hasOneUse()) | |||
| 56068 | return false; | |||
| 56069 | if (!isa<ConstantSDNode>(V.getOperand(0)) || | |||
| 56070 | !isa<ConstantSDNode>(V.getOperand(1))) | |||
| 56071 | return false; | |||
| 56072 | return isNullConstant(V.getOperand(0)) || isNullConstant(V.getOperand(1)) || | |||
| 56073 | (V.getConstantOperandAPInt(0).isSignedIntN(32) && | |||
| 56074 | V.getConstantOperandAPInt(1).isSignedIntN(32)); | |||
| 56075 | }; | |||
| 56076 | ||||
| 56077 | // Match an appropriate CMOV as the first operand of the add. | |||
| 56078 | SDValue Cmov = N->getOperand(0); | |||
| 56079 | SDValue OtherOp = N->getOperand(1); | |||
| 56080 | if (!isSuitableCmov(Cmov)) | |||
| 56081 | std::swap(Cmov, OtherOp); | |||
| 56082 | if (!isSuitableCmov(Cmov)) | |||
| 56083 | return SDValue(); | |||
| 56084 | ||||
| 56085 | // Don't remove a load folding opportunity for the add. That would neutralize | |||
| 56086 | // any improvements from removing constant materializations. | |||
| 56087 | if (X86::mayFoldLoad(OtherOp, Subtarget)) | |||
| 56088 | return SDValue(); | |||
| 56089 | ||||
| 56090 | EVT VT = N->getValueType(0); | |||
| 56091 | SDLoc DL(N); | |||
| 56092 | SDValue FalseOp = Cmov.getOperand(0); | |||
| 56093 | SDValue TrueOp = Cmov.getOperand(1); | |||
| 56094 | ||||
| 56095 | // We will push the add through the select, but we can potentially do better | |||
| 56096 | // if we know there is another add in the sequence and this is pointer math. | |||
| 56097 | // In that case, we can absorb an add into the trailing memory op and avoid | |||
| 56098 | // a 3-operand LEA which is likely slower than a 2-operand LEA. | |||
| 56099 | // TODO: If target has "slow3OpsLEA", do this even without the trailing memop? | |||
| 56100 | if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() && | |||
| 56101 | !isa<ConstantSDNode>(OtherOp.getOperand(0)) && | |||
| 56102 | all_of(N->uses(), [&](SDNode *Use) { | |||
| 56103 | auto *MemNode = dyn_cast<MemSDNode>(Use); | |||
| 56104 | return MemNode && MemNode->getBasePtr().getNode() == N; | |||
| 56105 | })) { | |||
| 56106 | // add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y | |||
| 56107 | // TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but | |||
| 56108 | // it is possible that choosing op1 might be better. | |||
| 56109 | SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1); | |||
| 56110 | FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp); | |||
| 56111 | TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp); | |||
| 56112 | Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, | |||
| 56113 | Cmov.getOperand(2), Cmov.getOperand(3)); | |||
| 56114 | return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y); | |||
| 56115 | } | |||
| 56116 | ||||
| 56117 | // add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2) | |||
| 56118 | FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp); | |||
| 56119 | TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp); | |||
| 56120 | return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2), | |||
| 56121 | Cmov.getOperand(3)); | |||
| 56122 | } | |||
| 56123 | ||||
| 56124 | static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, | |||
| 56125 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 56126 | const X86Subtarget &Subtarget) { | |||
| 56127 | EVT VT = N->getValueType(0); | |||
| 56128 | SDValue Op0 = N->getOperand(0); | |||
| 56129 | SDValue Op1 = N->getOperand(1); | |||
| 56130 | SDLoc DL(N); | |||
| 56131 | ||||
| 56132 | if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget)) | |||
| 56133 | return Select; | |||
| 56134 | ||||
| 56135 | if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget)) | |||
| 56136 | return MAdd; | |||
| 56137 | if (SDValue MAdd = matchPMADDWD_2(DAG, Op0, Op1, DL, VT, Subtarget)) | |||
| 56138 | return MAdd; | |||
| 56139 | if (SDValue MAdd = combineAddOfPMADDWD(DAG, Op0, Op1, DL, VT)) | |||
| 56140 | return MAdd; | |||
| 56141 | ||||
| 56142 | // Try to synthesize horizontal adds from adds of shuffles. | |||
| 56143 | if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget)) | |||
| 56144 | return V; | |||
| 56145 | ||||
| 56146 | // If vectors of i1 are legal, turn (add (zext (vXi1 X)), Y) into | |||
| 56147 | // (sub Y, (sext (vXi1 X))). | |||
| 56148 | // FIXME: We have the (sub Y, (zext (vXi1 X))) -> (add (sext (vXi1 X)), Y) in | |||
| 56149 | // generic DAG combine without a legal type check, but adding this there | |||
| 56150 | // caused regressions. | |||
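| | // This works because (zext (vXi1 X)) == (sub 0, (sext (vXi1 X))), as the | |||
| | // sign extension of an i1 element is either 0 or -1. | |||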
| 56151 | if (VT.isVector()) { | |||
| 56152 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 56153 | if (Op0.getOpcode() == ISD::ZERO_EXTEND && | |||
| 56154 | Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 && | |||
| 56155 | TLI.isTypeLegal(Op0.getOperand(0).getValueType())) { | |||
| 56156 | SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op0.getOperand(0)); | |||
| 56157 | return DAG.getNode(ISD::SUB, DL, VT, Op1, SExt); | |||
| 56158 | } | |||
| 56159 | ||||
| 56160 | if (Op1.getOpcode() == ISD::ZERO_EXTEND && | |||
| 56161 | Op1.getOperand(0).getValueType().getVectorElementType() == MVT::i1 && | |||
| 56162 | TLI.isTypeLegal(Op1.getOperand(0).getValueType())) { | |||
| 56163 | SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op1.getOperand(0)); | |||
| 56164 | return DAG.getNode(ISD::SUB, DL, VT, Op0, SExt); | |||
| 56165 | } | |||
| 56166 | } | |||
| 56167 | ||||
| 56168 | // Fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W) | |||
| 56169 | if (Op0.getOpcode() == X86ISD::ADC && Op0->hasOneUse() && | |||
| 56170 | X86::isZeroNode(Op0.getOperand(1))) { | |||
| 56171 | assert(!Op0->hasAnyUseOfValue(1) && "Overflow bit in use"); | |||
| 56172 | return DAG.getNode(X86ISD::ADC, SDLoc(Op0), Op0->getVTList(), Op1, | |||
| 56173 | Op0.getOperand(0), Op0.getOperand(2)); | |||
| 56174 | } | |||
| 56175 | ||||
| 56176 | return combineAddOrSubToADCOrSBB(N, DAG); | |||
| 56177 | } | |||
| 56178 | ||||
| 56179 | // Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov | |||
| 56180 | // condition comes from the subtract node that produced -X. This matches the | |||
| 56181 | // cmov expansion for absolute value. By swapping the operands we convert abs | |||
| 56182 | // to nabs. | |||
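| | // i.e. (sub Y, abs(X)) becomes (add Y, nabs(X)), since nabs(X) == -abs(X). | |||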
| 56183 | static SDValue combineSubABS(SDNode *N, SelectionDAG &DAG) { | |||
| 56184 | SDValue N0 = N->getOperand(0); | |||
| 56185 | SDValue N1 = N->getOperand(1); | |||
| 56186 | ||||
| 56187 | if (N1.getOpcode() != X86ISD::CMOV || !N1.hasOneUse()) | |||
| 56188 | return SDValue(); | |||
| 56189 | ||||
| 56190 | X86::CondCode CC = (X86::CondCode)N1.getConstantOperandVal(2); | |||
| 56191 | if (CC != X86::COND_S && CC != X86::COND_NS) | |||
| 56192 | return SDValue(); | |||
| 56193 | ||||
| 56194 | // Condition should come from a negate operation. | |||
| 56195 | SDValue Cond = N1.getOperand(3); | |||
| 56196 | if (Cond.getOpcode() != X86ISD::SUB || !isNullConstant(Cond.getOperand(0))) | |||
| 56197 | return SDValue(); | |||
| 56198 | assert(Cond.getResNo() == 1 && "Unexpected result number"); | |||
| 56199 | ||||
| 56200 | // Get the X and -X from the negate. | |||
| 56201 | SDValue NegX = Cond.getValue(0); | |||
| 56202 | SDValue X = Cond.getOperand(1); | |||
| 56203 | ||||
| 56204 | SDValue FalseOp = N1.getOperand(0); | |||
| 56205 | SDValue TrueOp = N1.getOperand(1); | |||
| 56206 | ||||
| 56207 | // Cmov operands should be X and NegX. Order doesn't matter. | |||
| 56208 | if (!(TrueOp == X && FalseOp == NegX) && !(TrueOp == NegX && FalseOp == X)) | |||
| 56209 | return SDValue(); | |||
| 56210 | ||||
| 56211 | // Build a new CMOV with the operands swapped. | |||
| 56212 | SDLoc DL(N); | |||
| 56213 | MVT VT = N->getSimpleValueType(0); | |||
| 56214 | SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, TrueOp, FalseOp, | |||
| 56215 | N1.getOperand(2), Cond); | |||
| 56216 | // Convert sub to add. | |||
| 56217 | return DAG.getNode(ISD::ADD, DL, VT, N0, Cmov); | |||
| 56218 | } | |||
| 56219 | ||||
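| | // Fold (sub C, (zero_extend (setcc))) into an add of the inverted setcc, | |||
| | // e.g. (sub 5, (zext (setcc e))) -> (add (zext (setcc ne)), 4). | |||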
| 56220 | static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) { | |||
| 56221 | SDValue Op0 = N->getOperand(0); | |||
| 56222 | SDValue Op1 = N->getOperand(1); | |||
| 56223 | ||||
| 56224 | // (sub C (zero_extend (setcc))) | |||
| 56225 | // => | |||
| 56226 | // (add (zero_extend (setcc inverted)), C-1) if C is a nonzero immediate | |||
| 56227 | // Don't disturb (sub 0 setcc), which is easily done with neg. | |||
| 56228 | EVT VT = N->getValueType(0); | |||
| 56229 | auto *Op0C = dyn_cast<ConstantSDNode>(Op0); | |||
| 56230 | if (Op1.getOpcode() == ISD::ZERO_EXTEND && Op1.hasOneUse() && Op0C && | |||
| 56231 | !Op0C->isZero() && Op1.getOperand(0).getOpcode() == X86ISD::SETCC && | |||
| 56232 | Op1.getOperand(0).hasOneUse()) { | |||
| 56233 | SDValue SetCC = Op1.getOperand(0); | |||
| 56234 | X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0); | |||
| 56235 | X86::CondCode NewCC = X86::GetOppositeBranchCondition(CC); | |||
| 56236 | uint64_t NewImm = Op0C->getZExtValue() - 1; | |||
| 56237 | SDLoc DL(Op1); | |||
| 56238 | SDValue NewSetCC = getSETCC(NewCC, SetCC.getOperand(1), DL, DAG); | |||
| 56239 | NewSetCC = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NewSetCC); | |||
| 56240 | return DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(VT, VT), NewSetCC, | |||
| 56241 | DAG.getConstant(NewImm, DL, VT)); | |||
| 56242 | } | |||
| 56243 | ||||
| 56244 | return SDValue(); | |||
| 56245 | } | |||
| 56246 | ||||
| 56247 | static SDValue combineSub(SDNode *N, SelectionDAG &DAG, | |||
| 56248 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 56249 | const X86Subtarget &Subtarget) { | |||
| 56250 | SDValue Op0 = N->getOperand(0); | |||
| 56251 | SDValue Op1 = N->getOperand(1); | |||
| 56252 | ||||
| 56253 | // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt. | |||
| 56254 | auto IsNonOpaqueConstant = [&](SDValue Op) { | |||
| 56255 | if (SDNode *C = DAG.isConstantIntBuildVectorOrConstantInt(Op)) { | |||
| 56256 | if (auto *Cst = dyn_cast<ConstantSDNode>(C)) | |||
| 56257 | return !Cst->isOpaque(); | |||
| 56258 | return true; | |||
| 56259 | } | |||
| 56260 | return false; | |||
| 56261 | }; | |||
| 56262 | ||||
| 56263 | // X86 can't encode an immediate LHS of a sub. See if we can push the | |||
| 56264 | // negation into a preceding instruction. If the RHS of the sub is an XOR with | |||
| 56265 | // one use and a constant, invert the immediate, saving one register. | |||
| 56266 | // sub(C1, xor(X, C2)) -> add(xor(X, ~C2), C1+1) | |||
| 56267 | if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) && | |||
| 56268 | IsNonOpaqueConstant(Op1.getOperand(1)) && Op1->hasOneUse()) { | |||
| 56269 | SDLoc DL(N); | |||
| 56270 | EVT VT = Op0.getValueType(); | |||
| 56271 | SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0), | |||
| 56272 | DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT)); | |||
| 56273 | SDValue NewAdd = | |||
| 56274 | DAG.getNode(ISD::ADD, DL, VT, Op0, DAG.getConstant(1, DL, VT)); | |||
| 56275 | return DAG.getNode(ISD::ADD, DL, VT, NewXor, NewAdd); | |||
| 56276 | } | |||
| 56277 | ||||
| 56278 | if (SDValue V = combineSubABS(N, DAG)) | |||
| 56279 | return V; | |||
| 56280 | ||||
| 56281 | // Try to synthesize horizontal subs from subs of shuffles. | |||
| 56282 | if (SDValue V = combineToHorizontalAddSub(N, DAG, Subtarget)) | |||
| 56283 | return V; | |||
| 56284 | ||||
| 56285 | // Fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W) | |||
| 56286 | if (Op1.getOpcode() == X86ISD::ADC && Op1->hasOneUse() && | |||
| 56287 | X86::isZeroNode(Op1.getOperand(1))) { | |||
| 56288 | assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use"); | |||
| 56289 | return DAG.getNode(X86ISD::SBB, SDLoc(Op1), Op1->getVTList(), Op0, | |||
| 56290 | Op1.getOperand(0), Op1.getOperand(2)); | |||
| 56291 | } | |||
| 56292 | ||||
| 56293 | // Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y) | |||
| 56294 | // Don't fold to the ADC(0,0,W)/SETCC_CARRY pattern, which would prevent more folds. | |||
| 56295 | if (Op1.getOpcode() == X86ISD::SBB && Op1->hasOneUse() && | |||
| 56296 | !(X86::isZeroNode(Op0) && X86::isZeroNode(Op1.getOperand(1)))) { | |||
| 56297 | assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use"); | |||
| 56298 | SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0, | |||
| 56299 | Op1.getOperand(1), Op1.getOperand(2)); | |||
| 56300 | return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0), | |||
| 56301 | Op1.getOperand(0)); | |||
| 56302 | } | |||
| 56303 | ||||
| 56304 | if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget)) | |||
| 56305 | return V; | |||
| 56306 | ||||
| 56307 | if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG)) | |||
| 56308 | return V; | |||
| 56309 | ||||
| 56310 | return combineSubSetcc(N, DAG); | |||
| 56311 | } | |||
| 56312 | ||||
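| | // Fold vector compares with identical operands: PCMPEQ(X,X) is all-ones and | |||
| | // PCMPGT(X,X) is zero. | |||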
| 56313 | static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, | |||
| 56314 | const X86Subtarget &Subtarget) { | |||
| 56315 | MVT VT = N->getSimpleValueType(0); | |||
| 56316 | SDLoc DL(N); | |||
| 56317 | ||||
| 56318 | if (N->getOperand(0) == N->getOperand(1)) { | |||
| 56319 | if (N->getOpcode() == X86ISD::PCMPEQ) | |||
| 56320 | return DAG.getConstant(-1, DL, VT); | |||
| 56321 | if (N->getOpcode() == X86ISD::PCMPGT) | |||
| 56322 | return DAG.getConstant(0, DL, VT); | |||
| 56323 | } | |||
| 56324 | ||||
| 56325 | return SDValue(); | |||
| 56326 | } | |||
| 56327 | ||||
| 56328 | /// Helper that combines an array of subvector ops as if they were the operands | |||
| 56329 | /// of a ISD::CONCAT_VECTORS node, but may have come from another source (e.g. | |||
| 56330 | /// ISD::INSERT_SUBVECTOR). The ops are assumed to be of the same type. | |||
| 56331 | static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, | |||
| 56332 | ArrayRef<SDValue> Ops, SelectionDAG &DAG, | |||
| 56333 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 56334 | const X86Subtarget &Subtarget) { | |||
| 56335 | assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors"); | |||
| 56336 | unsigned EltSizeInBits = VT.getScalarSizeInBits(); | |||
| 56337 | ||||
| 56338 | if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); })) | |||
| 56339 | return DAG.getUNDEF(VT); | |||
| 56340 | ||||
| 56341 | if (llvm::all_of(Ops, [](SDValue Op) { | |||
| 56342 | return ISD::isBuildVectorAllZeros(Op.getNode()); | |||
| 56343 | })) | |||
| 56344 | return getZeroVector(VT, Subtarget, DAG, DL); | |||
| 56345 | ||||
| 56346 | SDValue Op0 = Ops[0]; | |||
| 56347 | bool IsSplat = llvm::all_equal(Ops); | |||
| 56348 | ||||
| 56349 | // Repeated subvectors. | |||
| 56350 | if (IsSplat && | |||
| 56351 | (VT.is256BitVector() || (VT.is512BitVector() && Subtarget.hasAVX512()))) { | |||
| 56352 | // If this broadcast is inserted into both halves, use a larger broadcast. | |||
| 56353 | if (Op0.getOpcode() == X86ISD::VBROADCAST) | |||
| 56354 | return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0)); | |||
| 56355 | ||||
| 56356 | // If this simple subvector or scalar/subvector broadcast_load is inserted | |||
| 56357 | // into both halves, use a larger broadcast_load. Update other uses to use | |||
| 56358 | // an extracted subvector. | |||
| 56359 | if (ISD::isNormalLoad(Op0.getNode()) || | |||
| 56360 | Op0.getOpcode() == X86ISD::VBROADCAST_LOAD || | |||
| 56361 | Op0.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) { | |||
| 56362 | auto *Mem = cast<MemSDNode>(Op0); | |||
| 56363 | unsigned Opc = Op0.getOpcode() == X86ISD::VBROADCAST_LOAD | |||
| 56364 | ? X86ISD::VBROADCAST_LOAD | |||
| 56365 | : X86ISD::SUBV_BROADCAST_LOAD; | |||
| 56366 | if (SDValue BcastLd = | |||
| 56367 | getBROADCAST_LOAD(Opc, DL, VT, Mem->getMemoryVT(), Mem, 0, DAG)) { | |||
| 56368 | SDValue BcastSrc = | |||
| 56369 | extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()); | |||
| 56370 | DAG.ReplaceAllUsesOfValueWith(Op0, BcastSrc); | |||
| 56371 | return BcastLd; | |||
| 56372 | } | |||
| 56373 | } | |||
| 56374 | ||||
| 56375 | // concat_vectors(movddup(x),movddup(x)) -> broadcast(x) | |||
| 56376 | if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 && | |||
| 56377 | (Subtarget.hasAVX2() || | |||
| 56378 | X86::mayFoldLoadIntoBroadcastFromMem(Op0.getOperand(0), | |||
| 56379 | VT.getScalarType(), Subtarget))) | |||
| 56380 | return DAG.getNode(X86ISD::VBROADCAST, DL, VT, | |||
| 56381 | DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f64, | |||
| 56382 | Op0.getOperand(0), | |||
| 56383 | DAG.getIntPtrConstant(0, DL))); | |||
| 56384 | ||||
| 56385 | // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x) | |||
| 56386 | if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR && | |||
| 56387 | (Subtarget.hasAVX2() || | |||
| 56388 | (EltSizeInBits >= 32 && | |||
| 56389 | X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) && | |||
| 56390 | Op0.getOperand(0).getValueType() == VT.getScalarType()) | |||
| 56391 | return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0)); | |||
| 56392 | ||||
| 56393 | // concat_vectors(extract_subvector(broadcast(x)), | |||
| 56394 | // extract_subvector(broadcast(x))) -> broadcast(x) | |||
| 56395 | if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR && | |||
| 56396 | Op0.getOperand(0).getValueType() == VT) { | |||
| 56397 | if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST || | |||
| 56398 | Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD) | |||
| 56399 | return Op0.getOperand(0); | |||
| 56400 | } | |||
| 56401 | } | |||
| 56402 | ||||
| 56403 | // concat(extract_subvector(v0,c0), extract_subvector(v1,c1)) -> vperm2x128. | |||
| 56404 | // Only concat subvector high halves, which is what vperm2x128 is best at. | |||
| 56405 | // TODO: This should go in combineX86ShufflesRecursively eventually. | |||
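| | // e.g. concat(extract_subvector(x, NumElts/2), extract_subvector(y, NumElts/2)) | |||
| | // -> vperm2x128(x, y, 0x31), selecting the high 128-bit half of each source. | |||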
| 56406 | if (VT.is256BitVector() && Ops.size() == 2) { | |||
| 56407 | SDValue Src0 = peekThroughBitcasts(Ops[0]); | |||
| 56408 | SDValue Src1 = peekThroughBitcasts(Ops[1]); | |||
| 56409 | if (Src0.getOpcode() == ISD::EXTRACT_SUBVECTOR && | |||
| 56410 | Src1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { | |||
| 56411 | EVT SrcVT0 = Src0.getOperand(0).getValueType(); | |||
| 56412 | EVT SrcVT1 = Src1.getOperand(0).getValueType(); | |||
| 56413 | unsigned NumSrcElts0 = SrcVT0.getVectorNumElements(); | |||
| 56414 | unsigned NumSrcElts1 = SrcVT1.getVectorNumElements(); | |||
| 56415 | if (SrcVT0.is256BitVector() && SrcVT1.is256BitVector() && | |||
| 56416 | Src0.getConstantOperandAPInt(1) == (NumSrcElts0 / 2) && | |||
| 56417 | Src1.getConstantOperandAPInt(1) == (NumSrcElts1 / 2)) { | |||
| 56418 | return DAG.getNode(X86ISD::VPERM2X128, DL, VT, | |||
| 56419 | DAG.getBitcast(VT, Src0.getOperand(0)), | |||
| 56420 | DAG.getBitcast(VT, Src1.getOperand(0)), | |||
| 56421 | DAG.getTargetConstant(0x31, DL, MVT::i8)); | |||
| 56422 | } | |||
| 56423 | } | |||
| 56424 | } | |||
| 56425 | ||||
| 56426 | // Repeated opcode. | |||
| 56427 | // TODO - combineX86ShufflesRecursively should handle shuffle concatenation | |||
| 56428 | // but it currently struggles with different vector widths. | |||
| 56429 | if (llvm::all_of(Ops, [Op0](SDValue Op) { | |||
| 56430 | return Op.getOpcode() == Op0.getOpcode(); | |||
| 56431 | })) { | |||
| 56432 | auto ConcatSubOperand = [&](EVT VT, ArrayRef<SDValue> SubOps, unsigned I) { | |||
| 56433 | SmallVector<SDValue> Subs; | |||
| 56434 | for (SDValue SubOp : SubOps) | |||
| 56435 | Subs.push_back(SubOp.getOperand(I)); | |||
| 56436 | return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Subs); | |||
| 56437 | }; | |||
| 56438 | auto IsConcatFree = [](MVT VT, ArrayRef<SDValue> SubOps, unsigned Op) { | |||
| 56439 | for (unsigned I = 0, E = SubOps.size(); I != E; ++I) { | |||
| 56440 | SDValue Sub = SubOps[I].getOperand(Op); | |||
| 56441 | unsigned NumSubElts = Sub.getValueType().getVectorNumElements(); | |||
| 56442 | if (Sub.getOpcode() != ISD::EXTRACT_SUBVECTOR || | |||
| 56443 | Sub.getOperand(0).getValueType() != VT || | |||
| 56444 | Sub.getConstantOperandAPInt(1) != (I * NumSubElts)) | |||
| 56445 | return false; | |||
| 56446 | } | |||
| 56447 | return true; | |||
| 56448 | }; | |||
| 56449 | ||||
| 56450 | unsigned NumOps = Ops.size(); | |||
| 56451 | switch (Op0.getOpcode()) { | |||
| 56452 | case X86ISD::VBROADCAST: { | |||
| 56453 | if (!IsSplat && llvm::all_of(Ops, [](SDValue Op) { | |||
| 56454 | return Op.getOperand(0).getValueType().is128BitVector(); | |||
| 56455 | })) { | |||
| 56456 | if (VT == MVT::v4f64 || VT == MVT::v4i64) | |||
| 56457 | return DAG.getNode(X86ISD::UNPCKL, DL, VT, | |||
| 56458 | ConcatSubOperand(VT, Ops, 0), | |||
| 56459 | ConcatSubOperand(VT, Ops, 0)); | |||
| 56460 | // TODO: Add pseudo v8i32 PSHUFD handling to AVX1Only targets. | |||
| 56461 | if (VT == MVT::v8f32 || (VT == MVT::v8i32 && Subtarget.hasInt256())) | |||
| 56462 | return DAG.getNode(VT == MVT::v8f32 ? X86ISD::VPERMILPI | |||
| 56463 | : X86ISD::PSHUFD, | |||
| 56464 | DL, VT, ConcatSubOperand(VT, Ops, 0), | |||
| 56465 | getV4X86ShuffleImm8ForMask({0, 0, 0, 0}, DL, DAG)); | |||
| 56466 | } | |||
| 56467 | break; | |||
| 56468 | } | |||
| 56469 | case X86ISD::MOVDDUP: | |||
| 56470 | case X86ISD::MOVSHDUP: | |||
| 56471 | case X86ISD::MOVSLDUP: { | |||
| 56472 | if (!IsSplat) | |||
| 56473 | return DAG.getNode(Op0.getOpcode(), DL, VT, | |||
| 56474 | ConcatSubOperand(VT, Ops, 0)); | |||
| 56475 | break; | |||
| 56476 | } | |||
| 56477 | case X86ISD::SHUFP: { | |||
| 56478 | // Add SHUFPD support if/when necessary. | |||
| 56479 | if (!IsSplat && VT.getScalarType() == MVT::f32 && | |||
| 56480 | llvm::all_of(Ops, [Op0](SDValue Op) { | |||
| 56481 | return Op.getOperand(2) == Op0.getOperand(2); | |||
| 56482 | })) { | |||
| 56483 | return DAG.getNode(Op0.getOpcode(), DL, VT, | |||
| 56484 | ConcatSubOperand(VT, Ops, 0), | |||
| 56485 | ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2)); | |||
| 56486 | } | |||
| 56487 | break; | |||
| 56488 | } | |||
| 56489 | case X86ISD::PSHUFHW: | |||
| 56490 | case X86ISD::PSHUFLW: | |||
| 56491 | case X86ISD::PSHUFD: | |||
| 56492 | if (!IsSplat && NumOps == 2 && VT.is256BitVector() && | |||
| 56493 | Subtarget.hasInt256() && Op0.getOperand(1) == Ops[1].getOperand(1)) { | |||
| 56494 | return DAG.getNode(Op0.getOpcode(), DL, VT, | |||
| 56495 | ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1)); | |||
| 56496 | } | |||
| 56497 | [[fallthrough]]; | |||
| 56498 | case X86ISD::VPERMILPI: | |||
| 56499 | if (!IsSplat && VT.getScalarSizeInBits() == 32 && | |||
| 56500 | (VT.is256BitVector() || | |||
| 56501 | (VT.is512BitVector() && Subtarget.useAVX512Regs())) && | |||
| 56502 | all_of(Ops, [&Op0](SDValue Op) { | |||
| 56503 | return Op0.getOperand(1) == Op.getOperand(1); | |||
| 56504 | })) { | |||
| 56505 | MVT FloatVT = VT.changeVectorElementType(MVT::f32); | |||
| 56506 | SDValue Res = DAG.getBitcast(FloatVT, ConcatSubOperand(VT, Ops, 0)); | |||
| 56507 | Res = | |||
| 56508 | DAG.getNode(X86ISD::VPERMILPI, DL, FloatVT, Res, Op0.getOperand(1)); | |||
| 56509 | return DAG.getBitcast(VT, Res); | |||
| 56510 | } | |||
| 56511 | if (!IsSplat && NumOps == 2 && VT == MVT::v4f64) { | |||
| 56512 | uint64_t Idx0 = Ops[0].getConstantOperandVal(1); | |||
| 56513 | uint64_t Idx1 = Ops[1].getConstantOperandVal(1); | |||
| 56514 | uint64_t Idx = ((Idx1 & 3) << 2) | (Idx0 & 3); | |||
| 56515 | return DAG.getNode(Op0.getOpcode(), DL, VT, | |||
| 56516 | ConcatSubOperand(VT, Ops, 0), | |||
| 56517 | DAG.getTargetConstant(Idx, DL, MVT::i8)); | |||
| 56518 | } | |||
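| | // The 4-bit immediate above packs the two 2-bit per-lane selectors: bits | |||
| | // [1:0] control the low 128-bit lane, bits [3:2] the high lane; e.g. | |||
| | // Idx0 = 1, Idx1 = 2 yields Idx = 0b1001. | |||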
| 56519 | break; | |||
| 56520 | case X86ISD::PSHUFB: | |||
| 56521 | if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) || | |||
| 56522 | (VT.is512BitVector() && Subtarget.useBWIRegs()))) { | |||
| 56523 | return DAG.getNode(Op0.getOpcode(), DL, VT, | |||
| 56524 | ConcatSubOperand(VT, Ops, 0), | |||
| 56525 | ConcatSubOperand(VT, Ops, 1)); | |||
| 56526 | } | |||
| 56527 | break; | |||
| 56528 | case X86ISD::VPERMV: | |||
| 56529 | if (!IsSplat && NumOps == 2 && | |||
| 56530 | (VT.is512BitVector() && Subtarget.useAVX512Regs())) { | |||
| 56531 | MVT OpVT = Op0.getSimpleValueType(); | |||
| 56532 | int NumSrcElts = OpVT.getVectorNumElements(); | |||
| 56533 | SmallVector<int, 64> ConcatMask; | |||
| 56534 | for (unsigned i = 0; i != NumOps; ++i) { | |||
| 56535 | SmallVector<int, 64> SubMask; | |||
| 56536 | SmallVector<SDValue, 2> SubOps; | |||
| 56537 | if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps, | |||
| 56538 | SubMask)) | |||
| 56539 | break; | |||
| 56540 | for (int M : SubMask) { | |||
| 56541 | if (0 <= M) | |||
| 56542 | M += i * NumSrcElts; | |||
| 56543 | ConcatMask.push_back(M); | |||
| 56544 | } | |||
| 56545 | } | |||
| 56546 | if (ConcatMask.size() == (NumOps * NumSrcElts)) { | |||
| 56547 | SDValue Src = concatSubVectors(Ops[0].getOperand(1), | |||
| 56548 | Ops[1].getOperand(1), DAG, DL); | |||
| 56549 | MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits); | |||
| 56550 | MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts); | |||
| 56551 | SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true); | |||
| 56552 | return DAG.getNode(X86ISD::VPERMV, DL, VT, Mask, Src); | |||
| 56553 | } | |||
| 56554 | } | |||
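| | // Sketch: two v4i64 VPERMVs with mask {1,0,3,2} become one v8i64 VPERMV | |||
| | // over the concatenated sources with mask {1,0,3,2,5,4,7,6} - op I's | |||
| | // indices are rebased by I * NumSrcElts above. | |||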
| 56555 | break; | |||
| 56556 | case X86ISD::VPERMV3: | |||
| 56557 | if (!IsSplat && NumOps == 2 && VT.is512BitVector()) { | |||
| 56558 | MVT OpVT = Op0.getSimpleValueType(); | |||
| 56559 | int NumSrcElts = OpVT.getVectorNumElements(); | |||
| 56560 | SmallVector<int, 64> ConcatMask; | |||
| 56561 | for (unsigned i = 0; i != NumOps; ++i) { | |||
| 56562 | SmallVector<int, 64> SubMask; | |||
| 56563 | SmallVector<SDValue, 2> SubOps; | |||
| 56564 | if (!getTargetShuffleMask(Ops[i].getNode(), OpVT, false, SubOps, | |||
| 56565 | SubMask)) | |||
| 56566 | break; | |||
| 56567 | for (int M : SubMask) { | |||
| 56568 | if (0 <= M) { | |||
| 56569 | M += M < NumSrcElts ? 0 : NumSrcElts; | |||
| 56570 | M += i * NumSrcElts; | |||
| 56571 | } | |||
| 56572 | ConcatMask.push_back(M); | |||
| 56573 | } | |||
| 56574 | } | |||
| 56575 | if (ConcatMask.size() == (NumOps * NumSrcElts)) { | |||
| 56576 | SDValue Src0 = concatSubVectors(Ops[0].getOperand(0), | |||
| 56577 | Ops[1].getOperand(0), DAG, DL); | |||
| 56578 | SDValue Src1 = concatSubVectors(Ops[0].getOperand(2), | |||
| 56579 | Ops[1].getOperand(2), DAG, DL); | |||
| 56580 | MVT IntMaskSVT = MVT::getIntegerVT(EltSizeInBits); | |||
| 56581 | MVT IntMaskVT = MVT::getVectorVT(IntMaskSVT, NumOps * NumSrcElts); | |||
| 56582 | SDValue Mask = getConstVector(ConcatMask, IntMaskVT, DAG, DL, true); | |||
| 56583 | return DAG.getNode(X86ISD::VPERMV3, DL, VT, Src0, Mask, Src1); | |||
| 56584 | } | |||
| 56585 | } | |||
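| | // The extra adjustment above reflects both sources doubling in width: with | |||
| | // NumSrcElts = 4, an op0 mask {0,4,1,5} (src0[0], src1[0], ...) becomes | |||
| | // {0,8,1,9}, since src1's elements start at index 8 after concatenation. | |||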
| 56586 | break; | |||
| 56587 | case ISD::TRUNCATE: | |||
| 56588 | if (!IsSplat && NumOps == 2 && VT.is256BitVector()) { | |||
| 56589 | EVT SrcVT = Ops[0].getOperand(0).getValueType(); | |||
| 56590 | if (SrcVT.is256BitVector() && SrcVT.isSimple() && | |||
| 56591 | SrcVT == Ops[1].getOperand(0).getValueType() && | |||
| 56592 | Subtarget.useAVX512Regs() && | |||
| 56593 | Subtarget.getPreferVectorWidth() >= 512 && | |||
| 56594 | (SrcVT.getScalarSizeInBits() > 16 || Subtarget.useBWIRegs())) { | |||
| 56595 | EVT NewSrcVT = SrcVT.getDoubleNumVectorElementsVT(*DAG.getContext()); | |||
| 56596 | return DAG.getNode(ISD::TRUNCATE, DL, VT, | |||
| 56597 | ConcatSubOperand(NewSrcVT, Ops, 0)); | |||
| 56598 | } | |||
| 56599 | } | |||
| 56600 | break; | |||
| 56601 | case X86ISD::VSHLI: | |||
| 56602 | case X86ISD::VSRLI: | |||
| 56603 | // Special case: SHL/SRL AVX1 v4i64 by 32 bits can lower as a shuffle. | |||
| 56604 | // TODO: Move this to LowerShiftByScalarImmediate? | |||
| 56605 | if (VT == MVT::v4i64 && !Subtarget.hasInt256() && | |||
| 56606 | llvm::all_of(Ops, [](SDValue Op) { | |||
| 56607 | return Op.getConstantOperandAPInt(1) == 32; | |||
| 56608 | })) { | |||
| 56609 | SDValue Res = DAG.getBitcast(MVT::v8i32, ConcatSubOperand(VT, Ops, 0)); | |||
| 56610 | SDValue Zero = getZeroVector(MVT::v8i32, Subtarget, DAG, DL); | |||
| 56611 | if (Op0.getOpcode() == X86ISD::VSHLI) { | |||
| 56612 | Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero, | |||
| 56613 | {8, 0, 8, 2, 8, 4, 8, 6}); | |||
| 56614 | } else { | |||
| 56615 | Res = DAG.getVectorShuffle(MVT::v8i32, DL, Res, Zero, | |||
| 56616 | {1, 8, 3, 8, 5, 8, 7, 8}); | |||
| 56617 | } | |||
| 56618 | return DAG.getBitcast(VT, Res); | |||
| 56619 | } | |||
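| | // Why this works: shifting a 64-bit element left by 32 moves its low i32 | |||
| | // into the high half and zeroes the low half, so viewed as v8i32 the | |||
| | // result is {0,e0,0,e2,0,e4,0,e6}; mask indices >= 8 select the zeros. | |||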
| 56620 | [[fallthrough]]; | |||
| 56621 | case X86ISD::VSRAI: | |||
| 56622 | case X86ISD::VSHL: | |||
| 56623 | case X86ISD::VSRL: | |||
| 56624 | case X86ISD::VSRA: | |||
| 56625 | if (((VT.is256BitVector() && Subtarget.hasInt256()) || | |||
| 56626 | (VT.is512BitVector() && Subtarget.useAVX512Regs() && | |||
| 56627 | (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) && | |||
| 56628 | llvm::all_of(Ops, [Op0](SDValue Op) { | |||
| 56629 | return Op0.getOperand(1) == Op.getOperand(1); | |||
| 56630 | })) { | |||
| 56631 | return DAG.getNode(Op0.getOpcode(), DL, VT, | |||
| 56632 | ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1)); | |||
| 56633 | } | |||
| 56634 | break; | |||
| 56635 | case X86ISD::VPERMI: | |||
| 56636 | case X86ISD::VROTLI: | |||
| 56637 | case X86ISD::VROTRI: | |||
| 56638 | if (VT.is512BitVector() && Subtarget.useAVX512Regs() && | |||
| 56639 | llvm::all_of(Ops, [Op0](SDValue Op) { | |||
| 56640 | return Op0.getOperand(1) == Op.getOperand(1); | |||
| 56641 | })) { | |||
| 56642 | return DAG.getNode(Op0.getOpcode(), DL, VT, | |||
| 56643 | ConcatSubOperand(VT, Ops, 0), Op0.getOperand(1)); | |||
| 56644 | } | |||
| 56645 | break; | |||
| 56646 | case ISD::AND: | |||
| 56647 | case ISD::OR: | |||
| 56648 | case ISD::XOR: | |||
| 56649 | case X86ISD::ANDNP: | |||
| 56650 | if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) || | |||
| 56651 | (VT.is512BitVector() && Subtarget.useAVX512Regs()))) { | |||
| 56652 | MVT SrcVT = Op0.getOperand(0).getSimpleValueType(); | |||
| 56653 | SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), | |||
| 56654 | NumOps * SrcVT.getVectorNumElements()); | |||
| 56655 | return DAG.getNode(Op0.getOpcode(), DL, VT, | |||
| 56656 | ConcatSubOperand(SrcVT, Ops, 0), | |||
| 56657 | ConcatSubOperand(SrcVT, Ops, 1)); | |||
| 56658 | } | |||
| 56659 | break; | |||
| 56660 | case X86ISD::GF2P8AFFINEQB: | |||
| 56661 | if (!IsSplat && | |||
| 56662 | (VT.is256BitVector() || | |||
| 56663 | (VT.is512BitVector() && Subtarget.useAVX512Regs())) && | |||
| 56664 | llvm::all_of(Ops, [Op0](SDValue Op) { | |||
| 56665 | return Op0.getOperand(2) == Op.getOperand(2); | |||
| 56666 | })) { | |||
| 56667 | return DAG.getNode(Op0.getOpcode(), DL, VT, | |||
| 56668 | ConcatSubOperand(VT, Ops, 0), | |||
| 56669 | ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2)); | |||
| 56670 | } | |||
| 56671 | break; | |||
| 56672 | case ISD::ADD: | |||
| 56673 | case ISD::SUB: | |||
| 56674 | case ISD::MUL: | |||
| 56675 | if (!IsSplat && ((VT.is256BitVector() && Subtarget.hasInt256()) || | |||
| 56676 | (VT.is512BitVector() && Subtarget.useAVX512Regs() && | |||
| 56677 | (EltSizeInBits >= 32 || Subtarget.useBWIRegs())))) { | |||
| 56678 | MVT SrcVT = Op0.getOperand(0).getSimpleValueType(); | |||
| 56679 | SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), | |||
| 56680 | NumOps * SrcVT.getVectorNumElements()); | |||
| 56681 | return DAG.getNode(Op0.getOpcode(), DL, VT, | |||
| 56682 | ConcatSubOperand(SrcVT, Ops, 0), | |||
| 56683 | ConcatSubOperand(SrcVT, Ops, 1)); | |||
| 56684 | } | |||
| 56685 | break; | |||
| 56686 | // Because VADD, VSUB and VMUL can execute on more ports than VINSERT and | |||
| 56687 | // their latency is short, we don't replace them here. | |||
| 56688 | case ISD::FDIV: | |||
| 56689 | if (!IsSplat && (VT.is256BitVector() || | |||
| 56690 | (VT.is512BitVector() && Subtarget.useAVX512Regs()))) { | |||
| 56691 | MVT SrcVT = Op0.getOperand(0).getSimpleValueType(); | |||
| 56692 | SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), | |||
| 56693 | NumOps * SrcVT.getVectorNumElements()); | |||
| 56694 | return DAG.getNode(Op0.getOpcode(), DL, VT, | |||
| 56695 | ConcatSubOperand(SrcVT, Ops, 0), | |||
| 56696 | ConcatSubOperand(SrcVT, Ops, 1)); | |||
| 56697 | } | |||
| 56698 | break; | |||
| 56699 | case X86ISD::HADD: | |||
| 56700 | case X86ISD::HSUB: | |||
| 56701 | case X86ISD::FHADD: | |||
| 56702 | case X86ISD::FHSUB: | |||
| 56703 | case X86ISD::PACKSS: | |||
| 56704 | case X86ISD::PACKUS: | |||
| 56705 | if (!IsSplat && VT.is256BitVector() && | |||
| 56706 | (VT.isFloatingPoint() || Subtarget.hasInt256())) { | |||
| 56707 | MVT SrcVT = Op0.getOperand(0).getSimpleValueType(); | |||
| 56708 | SrcVT = MVT::getVectorVT(SrcVT.getScalarType(), | |||
| 56709 | NumOps * SrcVT.getVectorNumElements()); | |||
| 56710 | return DAG.getNode(Op0.getOpcode(), DL, VT, | |||
| 56711 | ConcatSubOperand(SrcVT, Ops, 0), | |||
| 56712 | ConcatSubOperand(SrcVT, Ops, 1)); | |||
| 56713 | } | |||
| 56714 | break; | |||
| 56715 | case X86ISD::PALIGNR: | |||
| 56716 | if (!IsSplat && | |||
| 56717 | ((VT.is256BitVector() && Subtarget.hasInt256()) || | |||
| 56718 | (VT.is512BitVector() && Subtarget.useBWIRegs())) && | |||
| 56719 | llvm::all_of(Ops, [Op0](SDValue Op) { | |||
| 56720 | return Op0.getOperand(2) == Op.getOperand(2); | |||
| 56721 | })) { | |||
| 56722 | return DAG.getNode(Op0.getOpcode(), DL, VT, | |||
| 56723 | ConcatSubOperand(VT, Ops, 0), | |||
| 56724 | ConcatSubOperand(VT, Ops, 1), Op0.getOperand(2)); | |||
| 56725 | } | |||
| 56726 | break; | |||
| 56727 | case ISD::VSELECT: | |||
| 56728 | if (!IsSplat && Subtarget.hasAVX512() && | |||
| 56729 | (VT.is256BitVector() || | |||
| 56730 | (VT.is512BitVector() && Subtarget.useAVX512Regs())) && | |||
| 56731 | (EltSizeInBits >= 32 || Subtarget.hasBWI())) { | |||
| 56732 | EVT SelVT = Ops[0].getOperand(0).getValueType(); | |||
| 56733 | if (SelVT.getVectorElementType() == MVT::i1) { | |||
| 56734 | SelVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, | |||
| 56735 | Ops.size() * SelVT.getVectorNumElements()); | |||
| 56736 | if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT)) | |||
| 56737 | return DAG.getNode(Op0.getOpcode(), DL, VT, | |||
| 56738 | ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0), | |||
| 56739 | ConcatSubOperand(VT, Ops, 1), | |||
| 56740 | ConcatSubOperand(VT, Ops, 2)); | |||
| 56741 | } | |||
| 56742 | } | |||
| 56743 | [[fallthrough]]; | |||
| 56744 | case X86ISD::BLENDV: | |||
| 56745 | if (!IsSplat && VT.is256BitVector() && Ops.size() == 2 && | |||
| 56746 | (EltSizeInBits >= 32 || Subtarget.hasInt256()) && | |||
| 56747 | IsConcatFree(VT, Ops, 1) && IsConcatFree(VT, Ops, 2)) { | |||
| 56748 | EVT SelVT = Ops[0].getOperand(0).getValueType(); | |||
| 56749 | SelVT = SelVT.getDoubleNumVectorElementsVT(*DAG.getContext()); | |||
| 56750 | if (DAG.getTargetLoweringInfo().isTypeLegal(SelVT)) | |||
| 56751 | return DAG.getNode(Op0.getOpcode(), DL, VT, | |||
| 56752 | ConcatSubOperand(SelVT.getSimpleVT(), Ops, 0), | |||
| 56753 | ConcatSubOperand(VT, Ops, 1), | |||
| 56754 | ConcatSubOperand(VT, Ops, 2)); | |||
| 56755 | } | |||
| 56756 | break; | |||
| 56757 | } | |||
| 56758 | } | |||
| 56759 | ||||
| 56760 | // Fold subvector loads into one. | |||
| 56761 | // If needed, look through bitcasts to get to the load. | |||
| 56762 | if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(Op0))) { | |||
| 56763 | unsigned Fast; | |||
| 56764 | const X86TargetLowering *TLI = Subtarget.getTargetLowering(); | |||
| 56765 | if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT, | |||
| 56766 | *FirstLd->getMemOperand(), &Fast) && | |||
| 56767 | Fast) { | |||
| 56768 | if (SDValue Ld = | |||
| 56769 | EltsFromConsecutiveLoads(VT, Ops, DL, DAG, Subtarget, false)) | |||
| 56770 | return Ld; | |||
| 56771 | } | |||
| 56772 | } | |||
| 56773 | ||||
| 56774 | // Attempt to fold target constant loads. | |||
| 56775 | if (all_of(Ops, [](SDValue Op) { return getTargetConstantFromNode(Op); })) { | |||
| 56776 | SmallVector<APInt> EltBits; | |||
| 56777 | APInt UndefElts = APInt::getZero(VT.getVectorNumElements()); | |||
| 56778 | for (unsigned I = 0, E = Ops.size(); I != E; ++I) { | |||
| 56779 | APInt OpUndefElts; | |||
| 56780 | SmallVector<APInt> OpEltBits; | |||
| 56781 | if (!getTargetConstantBitsFromNode(Ops[I], EltSizeInBits, OpUndefElts, | |||
| 56782 | OpEltBits, true, false)) | |||
| 56783 | break; | |||
| 56784 | EltBits.append(OpEltBits); | |||
| 56785 | UndefElts.insertBits(OpUndefElts, I * OpUndefElts.getBitWidth()); | |||
| 56786 | } | |||
| 56787 | if (EltBits.size() == VT.getVectorNumElements()) | |||
| 56788 | return getConstVector(EltBits, UndefElts, VT, DAG, DL); | |||
| 56789 | } | |||
| 56790 | ||||
| 56791 | return SDValue(); | |||
| 56792 | } | |||
| 56793 | ||||
| 56794 | static SDValue combineCONCAT_VECTORS(SDNode *N, SelectionDAG &DAG, | |||
| 56795 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 56796 | const X86Subtarget &Subtarget) { | |||
| 56797 | EVT VT = N->getValueType(0); | |||
| 56798 | EVT SrcVT = N->getOperand(0).getValueType(); | |||
| 56799 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 56800 | SmallVector<SDValue, 4> Ops(N->op_begin(), N->op_end()); | |||
| 56801 | ||||
| 56802 | if (VT.getVectorElementType() == MVT::i1) { | |||
| 56803 | // Attempt to constant fold. | |||
| 56804 | unsigned SubSizeInBits = SrcVT.getSizeInBits(); | |||
| 56805 | APInt Constant = APInt::getZero(VT.getSizeInBits()); | |||
| 56806 | for (unsigned I = 0, E = Ops.size(); I != E; ++I) { | |||
| 56807 | auto *C = dyn_cast<ConstantSDNode>(peekThroughBitcasts(Ops[I])); | |||
| 56808 | if (!C) break; | |||
| 56809 | Constant.insertBits(C->getAPIntValue(), I * SubSizeInBits); | |||
| 56810 | if (I == (E - 1)) { | |||
| 56811 | EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits()); | |||
| 56812 | if (TLI.isTypeLegal(IntVT)) | |||
| 56813 | return DAG.getBitcast(VT, DAG.getConstant(Constant, SDLoc(N), IntVT)); | |||
| 56814 | } | |||
| 56815 | } | |||
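| | // e.g. concat(v8i1 C0, v8i1 C1) constant-folds to bitcast(i16 constant | |||
| | // (C1 << 8) | C0) - sub-operand I lands at bit offset I * SubSizeInBits. | |||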
| 56816 | ||||
| 56817 | // Don't do anything else for i1 vectors. | |||
| 56818 | return SDValue(); | |||
| 56819 | } | |||
| 56820 | ||||
| 56821 | if (Subtarget.hasAVX() && TLI.isTypeLegal(VT) && TLI.isTypeLegal(SrcVT)) { | |||
| 56822 | if (SDValue R = combineConcatVectorOps(SDLoc(N), VT.getSimpleVT(), Ops, DAG, | |||
| 56823 | DCI, Subtarget)) | |||
| 56824 | return R; | |||
| 56825 | } | |||
| 56826 | ||||
| 56827 | return SDValue(); | |||
| 56828 | } | |||
| 56829 | ||||
| 56830 | static SDValue combineINSERT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, | |||
| 56831 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 56832 | const X86Subtarget &Subtarget) { | |||
| 56833 | if (DCI.isBeforeLegalizeOps()) | |||
| 56834 | return SDValue(); | |||
| 56835 | ||||
| 56836 | MVT OpVT = N->getSimpleValueType(0); | |||
| 56837 | ||||
| 56838 | bool IsI1Vector = OpVT.getVectorElementType() == MVT::i1; | |||
| 56839 | ||||
| 56840 | SDLoc dl(N); | |||
| 56841 | SDValue Vec = N->getOperand(0); | |||
| 56842 | SDValue SubVec = N->getOperand(1); | |||
| 56843 | ||||
| 56844 | uint64_t IdxVal = N->getConstantOperandVal(2); | |||
| 56845 | MVT SubVecVT = SubVec.getSimpleValueType(); | |||
| 56846 | ||||
| 56847 | if (Vec.isUndef() && SubVec.isUndef()) | |||
| 56848 | return DAG.getUNDEF(OpVT); | |||
| 56849 | ||||
| 56850 | // Inserting undefs/zeros into zeros/undefs produces a zero vector. | |||
| 56851 | if ((Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())) && | |||
| 56852 | (SubVec.isUndef() || ISD::isBuildVectorAllZeros(SubVec.getNode()))) | |||
| 56853 | return getZeroVector(OpVT, Subtarget, DAG, dl); | |||
| 56854 | ||||
| 56855 | if (ISD::isBuildVectorAllZeros(Vec.getNode())) { | |||
| 56856 | // If we're inserting into a zero vector and then into a larger zero vector, | |||
| 56857 | // just insert into the larger zero vector directly. | |||
| 56858 | if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR && | |||
| 56859 | ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) { | |||
| 56860 | uint64_t Idx2Val = SubVec.getConstantOperandVal(2); | |||
| 56861 | return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, | |||
| 56862 | getZeroVector(OpVT, Subtarget, DAG, dl), | |||
| 56863 | SubVec.getOperand(1), | |||
| 56864 | DAG.getIntPtrConstant(IdxVal + Idx2Val, dl)); | |||
| 56865 | } | |||
| 56866 | ||||
| 56867 | // If we're inserting into a zero vector and our input was extracted from an | |||
| 56868 | // insert into a zero vector of the same type, and the extraction was at | |||
| 56869 | // least as large as the original insertion, just insert the original | |||
| 56870 | // subvector into a zero vector. | |||
| 56871 | if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && IdxVal == 0 && | |||
| 56872 | isNullConstant(SubVec.getOperand(1)) && | |||
| 56873 | SubVec.getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR) { | |||
| 56874 | SDValue Ins = SubVec.getOperand(0); | |||
| 56875 | if (isNullConstant(Ins.getOperand(2)) && | |||
| 56876 | ISD::isBuildVectorAllZeros(Ins.getOperand(0).getNode()) && | |||
| 56877 | Ins.getOperand(1).getValueSizeInBits().getFixedValue() <= | |||
| 56878 | SubVecVT.getFixedSizeInBits()) | |||
| 56879 | return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, | |||
| 56880 | getZeroVector(OpVT, Subtarget, DAG, dl), | |||
| 56881 | Ins.getOperand(1), N->getOperand(2)); | |||
| 56882 | } | |||
| 56883 | } | |||
| 56884 | ||||
| 56885 | // Stop here if this is an i1 vector. | |||
| 56886 | if (IsI1Vector) | |||
| 56887 | return SDValue(); | |||
| 56888 | ||||
| 56889 | // Eliminate an intermediate vector widening: | |||
| 56890 | // insert_subvector X, (insert_subvector undef, Y, 0), Idx --> | |||
| 56891 | // insert_subvector X, Y, Idx | |||
| 56892 | // TODO: This is a more general version of a DAGCombiner fold, can we move it | |||
| 56893 | // there? | |||
| 56894 | if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR && | |||
| 56895 | SubVec.getOperand(0).isUndef() && isNullConstant(SubVec.getOperand(2))) | |||
| 56896 | return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, | |||
| 56897 | SubVec.getOperand(1), N->getOperand(2)); | |||
| 56898 | ||||
| 56899 | // If this is an insert of an extract, combine to a shuffle. Don't do this | |||
| 56900 | // if the insert or extract can be represented with a subregister operation. | |||
| 56901 | if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && | |||
| 56902 | SubVec.getOperand(0).getSimpleValueType() == OpVT && | |||
| 56903 | (IdxVal != 0 || | |||
| 56904 | !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) { | |||
| 56905 | int ExtIdxVal = SubVec.getConstantOperandVal(1); | |||
| 56906 | if (ExtIdxVal != 0) { | |||
| 56907 | int VecNumElts = OpVT.getVectorNumElements(); | |||
| 56908 | int SubVecNumElts = SubVecVT.getVectorNumElements(); | |||
| 56909 | SmallVector<int, 64> Mask(VecNumElts); | |||
| 56910 | // First create an identity shuffle mask. | |||
| 56911 | for (int i = 0; i != VecNumElts; ++i) | |||
| 56912 | Mask[i] = i; | |||
| 56913 | // Now insert the extracted portion. | |||
| 56914 | for (int i = 0; i != SubVecNumElts; ++i) | |||
| 56915 | Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts; | |||
| 56916 | ||||
| 56917 | return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask); | |||
| 56918 | } | |||
| 56919 | } | |||
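| | // e.g. inserting extract_subvector(X, 4) at index 0 of a v8i32 Vec gives | |||
| | // shuffle(Vec, X, {12,13,14,15,4,5,6,7}); indices >= VecNumElts (8) pick | |||
| | // elements from the second shuffle operand X. | |||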
| 56920 | ||||
| 56921 | // Match concat_vector style patterns. | |||
| 56922 | SmallVector<SDValue, 2> SubVectorOps; | |||
| 56923 | if (collectConcatOps(N, SubVectorOps, DAG)) { | |||
| 56924 | if (SDValue Fold = | |||
| 56925 | combineConcatVectorOps(dl, OpVT, SubVectorOps, DAG, DCI, Subtarget)) | |||
| 56926 | return Fold; | |||
| 56927 | ||||
| 56928 | // If we're inserting all zeros into the upper half, change this to | |||
| 56929 | // a concat with zero. We will match this to a move | |||
| 56930 | // with implicit upper bit zeroing during isel. | |||
| 56931 | // We do this here because we don't want combineConcatVectorOps to | |||
| 56932 | // create INSERT_SUBVECTOR from CONCAT_VECTORS. | |||
| 56933 | if (SubVectorOps.size() == 2 && | |||
| 56934 | ISD::isBuildVectorAllZeros(SubVectorOps[1].getNode())) | |||
| 56935 | return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, | |||
| 56936 | getZeroVector(OpVT, Subtarget, DAG, dl), | |||
| 56937 | SubVectorOps[0], DAG.getIntPtrConstant(0, dl)); | |||
| 56938 | } | |||
| 56939 | ||||
| 56940 | // If this is a broadcast insert into an upper undef, use a larger broadcast. | |||
| 56941 | if (Vec.isUndef() && IdxVal != 0 && SubVec.getOpcode() == X86ISD::VBROADCAST) | |||
| 56942 | return DAG.getNode(X86ISD::VBROADCAST, dl, OpVT, SubVec.getOperand(0)); | |||
| 56943 | ||||
| 56944 | // If this is a broadcast load inserted into an upper undef, use a larger | |||
| 56945 | // broadcast load. | |||
| 56946 | if (Vec.isUndef() && IdxVal != 0 && SubVec.hasOneUse() && | |||
| 56947 | SubVec.getOpcode() == X86ISD::VBROADCAST_LOAD) { | |||
| 56948 | auto *MemIntr = cast<MemIntrinsicSDNode>(SubVec); | |||
| 56949 | SDVTList Tys = DAG.getVTList(OpVT, MVT::Other); | |||
| 56950 | SDValue Ops[] = { MemIntr->getChain(), MemIntr->getBasePtr() }; | |||
| 56951 | SDValue BcastLd = | |||
| 56952 | DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, | |||
| 56953 | MemIntr->getMemoryVT(), | |||
| 56954 | MemIntr->getMemOperand()); | |||
| 56955 | DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1)); | |||
| 56956 | return BcastLd; | |||
| 56957 | } | |||
| 56958 | ||||
| 56959 | // If we're splatting the lower half subvector of a full vector load into the | |||
| 56960 | // upper half, attempt to create a subvector broadcast. | |||
| 56961 | if (IdxVal == (OpVT.getVectorNumElements() / 2) && SubVec.hasOneUse() && | |||
| 56962 | Vec.getValueSizeInBits() == (2 * SubVec.getValueSizeInBits())) { | |||
| 56963 | auto *VecLd = dyn_cast<LoadSDNode>(Vec); | |||
| 56964 | auto *SubLd = dyn_cast<LoadSDNode>(SubVec); | |||
| 56965 | if (VecLd && SubLd && | |||
| 56966 | DAG.areNonVolatileConsecutiveLoads(SubLd, VecLd, | |||
| 56967 | SubVec.getValueSizeInBits() / 8, 0)) | |||
| 56968 | return getBROADCAST_LOAD(X86ISD::SUBV_BROADCAST_LOAD, dl, OpVT, SubVecVT, | |||
| 56969 | SubLd, 0, DAG); | |||
| 56970 | } | |||
| 56971 | ||||
| 56972 | return SDValue(); | |||
| 56973 | } | |||
| 56974 | ||||
| 56975 | /// If we are extracting a subvector of a vector select and the select condition | |||
| 56976 | /// is composed of concatenated vectors, try to narrow the select width. This | |||
| 56977 | /// is a common pattern for AVX1 integer code because 256-bit selects may be | |||
| 56978 | /// legal, but there is almost no integer math/logic available for 256-bit. | |||
| 56979 | /// This function should only be called with legal types (otherwise, the calls | |||
| 56980 | /// to get simple value types will assert). | |||
| 56981 | static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) { | |||
| 56982 | SDValue Sel = Ext->getOperand(0); | |||
| 56983 | SmallVector<SDValue, 4> CatOps; | |||
| 56984 | if (Sel.getOpcode() != ISD::VSELECT || | |||
| 56985 | !collectConcatOps(Sel.getOperand(0).getNode(), CatOps, DAG)) | |||
| 56986 | return SDValue(); | |||
| 56987 | ||||
| 56988 | // Note: We assume simple value types because this should only be called with | |||
| 56989 | // legal operations/types. | |||
| 56990 | // TODO: This can be extended to handle extraction to 256-bits. | |||
| 56991 | MVT VT = Ext->getSimpleValueType(0); | |||
| 56992 | if (!VT.is128BitVector()) | |||
| 56993 | return SDValue(); | |||
| 56994 | ||||
| 56995 | MVT SelCondVT = Sel.getOperand(0).getSimpleValueType(); | |||
| 56996 | if (!SelCondVT.is256BitVector() && !SelCondVT.is512BitVector()) | |||
| 56997 | return SDValue(); | |||
| 56998 | ||||
| 56999 | MVT WideVT = Ext->getOperand(0).getSimpleValueType(); | |||
| 57000 | MVT SelVT = Sel.getSimpleValueType(); | |||
| 57001 | assert((SelVT.is256BitVector() || SelVT.is512BitVector()) && | |||
| 57002 | "Unexpected vector type with legal operations"); | |||
| 57003 | ||||
| 57004 | unsigned SelElts = SelVT.getVectorNumElements(); | |||
| 57005 | unsigned CastedElts = WideVT.getVectorNumElements(); | |||
| 57006 | unsigned ExtIdx = Ext->getConstantOperandVal(1); | |||
| 57007 | if (SelElts % CastedElts == 0) { | |||
| 57008 | // The select has the same or more (narrower) elements than the extract | |||
| 57009 | // operand. The extraction index gets scaled by that factor. | |||
| 57010 | ExtIdx *= (SelElts / CastedElts); | |||
| 57011 | } else if (CastedElts % SelElts == 0) { | |||
| 57012 | // The select has less (wider) elements than the extract operand. Make sure | |||
| 57013 | // that the extraction index can be divided evenly. | |||
| 57014 | unsigned IndexDivisor = CastedElts / SelElts; | |||
| 57015 | if (ExtIdx % IndexDivisor != 0) | |||
| 57016 | return SDValue(); | |||
| 57017 | ExtIdx /= IndexDivisor; | |||
| 57018 | } else { | |||
| 57019 | llvm_unreachable("Element count of simple vector types are not divisible?"); | |||
| 57020 | } | |||
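| | // e.g. VT = v4i32, WideVT = v8i32 (CastedElts = 8), Sel = v32i8 | |||
| | // (SelElts = 32), ExtIdx = 4: the index is rescaled by 32 / 8 = 4 to 16, | |||
| | // the same 128-bit offset expressed in v32i8 element units. | |||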
| 57021 | ||||
| 57022 | unsigned NarrowingFactor = WideVT.getSizeInBits() / VT.getSizeInBits(); | |||
| 57023 | unsigned NarrowElts = SelElts / NarrowingFactor; | |||
| 57024 | MVT NarrowSelVT = MVT::getVectorVT(SelVT.getVectorElementType(), NarrowElts); | |||
| 57025 | SDLoc DL(Ext); | |||
| 57026 | SDValue ExtCond = extract128BitVector(Sel.getOperand(0), ExtIdx, DAG, DL); | |||
| 57027 | SDValue ExtT = extract128BitVector(Sel.getOperand(1), ExtIdx, DAG, DL); | |||
| 57028 | SDValue ExtF = extract128BitVector(Sel.getOperand(2), ExtIdx, DAG, DL); | |||
| 57029 | SDValue NarrowSel = DAG.getSelect(DL, NarrowSelVT, ExtCond, ExtT, ExtF); | |||
| 57030 | return DAG.getBitcast(VT, NarrowSel); | |||
| 57031 | } | |||
| 57032 | ||||
| 57033 | static SDValue combineEXTRACT_SUBVECTOR(SDNode *N, SelectionDAG &DAG, | |||
| 57034 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 57035 | const X86Subtarget &Subtarget) { | |||
| 57036 | // For AVX1 only, if we are extracting from a 256-bit and+not (which will | |||
| 57037 | // eventually get combined/lowered into ANDNP) with a concatenated operand, | |||
| 57038 | // split the 'and' into 128-bit ops to avoid the concatenate and extract. | |||
| 57039 | // We let generic combining take over from there to simplify the | |||
| 57040 | // insert/extract and 'not'. | |||
| 57041 | // This pattern emerges during AVX1 legalization. We handle it before lowering | |||
| 57042 | // to avoid complications like splitting constant vector loads. | |||
| 57043 | ||||
| 57044 | // Capture the original wide type in the likely case that we need to bitcast | |||
| 57045 | // back to this type. | |||
| 57046 | if (!N->getValueType(0).isSimple()) | |||
| 57047 | return SDValue(); | |||
| 57048 | ||||
| 57049 | MVT VT = N->getSimpleValueType(0); | |||
| 57050 | SDValue InVec = N->getOperand(0); | |||
| 57051 | unsigned IdxVal = N->getConstantOperandVal(1); | |||
| 57052 | SDValue InVecBC = peekThroughBitcasts(InVec); | |||
| 57053 | EVT InVecVT = InVec.getValueType(); | |||
| 57054 | unsigned SizeInBits = VT.getSizeInBits(); | |||
| 57055 | unsigned InSizeInBits = InVecVT.getSizeInBits(); | |||
| 57056 | unsigned NumSubElts = VT.getVectorNumElements(); | |||
| 57057 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 57058 | ||||
| 57059 | if (Subtarget.hasAVX() && !Subtarget.hasAVX2() && | |||
| 57060 | TLI.isTypeLegal(InVecVT) && | |||
| 57061 | InSizeInBits == 256 && InVecBC.getOpcode() == ISD::AND) { | |||
| 57062 | auto isConcatenatedNot = [](SDValue V) { | |||
| 57063 | V = peekThroughBitcasts(V); | |||
| 57064 | if (!isBitwiseNot(V)) | |||
| 57065 | return false; | |||
| 57066 | SDValue NotOp = V->getOperand(0); | |||
| 57067 | return peekThroughBitcasts(NotOp).getOpcode() == ISD::CONCAT_VECTORS; | |||
| 57068 | }; | |||
| 57069 | if (isConcatenatedNot(InVecBC.getOperand(0)) || | |||
| 57070 | isConcatenatedNot(InVecBC.getOperand(1))) { | |||
| 57071 | // extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1 | |||
| 57072 | SDValue Concat = splitVectorIntBinary(InVecBC, DAG); | |||
| 57073 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, | |||
| 57074 | DAG.getBitcast(InVecVT, Concat), N->getOperand(1)); | |||
| 57075 | } | |||
| 57076 | } | |||
| 57077 | ||||
| 57078 | if (DCI.isBeforeLegalizeOps()) | |||
| 57079 | return SDValue(); | |||
| 57080 | ||||
| 57081 | if (SDValue V = narrowExtractedVectorSelect(N, DAG)) | |||
| 57082 | return V; | |||
| 57083 | ||||
| 57084 | if (ISD::isBuildVectorAllZeros(InVec.getNode())) | |||
| 57085 | return getZeroVector(VT, Subtarget, DAG, SDLoc(N)); | |||
| 57086 | ||||
| 57087 | if (ISD::isBuildVectorAllOnes(InVec.getNode())) { | |||
| 57088 | if (VT.getScalarType() == MVT::i1) | |||
| 57089 | return DAG.getConstant(1, SDLoc(N), VT); | |||
| 57090 | return getOnesVector(VT, DAG, SDLoc(N)); | |||
| 57091 | } | |||
| 57092 | ||||
| 57093 | if (InVec.getOpcode() == ISD::BUILD_VECTOR) | |||
| 57094 | return DAG.getBuildVector(VT, SDLoc(N), | |||
| 57095 | InVec->ops().slice(IdxVal, NumSubElts)); | |||
| 57096 | ||||
| 57097 | // If we are extracting from an insert into a larger vector, replace with a | |||
| 57098 | // smaller insert if the extraction covers at least the inserted subvector. | |||
| 57099 | // Don't do this for i1 vectors. | |||
| 57100 | // TODO: Relax the matching indices requirement? | |||
| 57101 | if (VT.getVectorElementType() != MVT::i1 && | |||
| 57102 | InVec.getOpcode() == ISD::INSERT_SUBVECTOR && InVec.hasOneUse() && | |||
| 57103 | IdxVal == InVec.getConstantOperandVal(2) && | |||
| 57104 | InVec.getOperand(1).getValueSizeInBits() <= SizeInBits) { | |||
| 57105 | SDLoc DL(N); | |||
| 57106 | SDValue NewExt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, | |||
| 57107 | InVec.getOperand(0), N->getOperand(1)); | |||
| 57108 | unsigned NewIdxVal = InVec.getConstantOperandVal(2) - IdxVal; | |||
| 57109 | return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, NewExt, | |||
| 57110 | InVec.getOperand(1), | |||
| 57111 | DAG.getVectorIdxConstant(NewIdxVal, DL)); | |||
| 57112 | } | |||
| 57113 | ||||
| 57114 | // If we're extracting an upper subvector from a broadcast, we should just | |||
| 57115 | // extract the lowest subvector instead, which should allow | |||
| 57116 | // SimplifyDemandedVectorElts to do more simplifications. | |||
| 57117 | if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST || | |||
| 57118 | InVec.getOpcode() == X86ISD::VBROADCAST_LOAD || | |||
| 57119 | DAG.isSplatValue(InVec, /*AllowUndefs*/ false))) | |||
| 57120 | return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits); | |||
| 57121 | ||||
| 57122 | // If we're extracting a broadcasted subvector, just use the lowest subvector. | |||
| 57123 | if (IdxVal != 0 && InVec.getOpcode() == X86ISD::SUBV_BROADCAST_LOAD && | |||
| 57124 | cast<MemIntrinsicSDNode>(InVec)->getMemoryVT() == VT) | |||
| 57125 | return extractSubVector(InVec, 0, DAG, SDLoc(N), SizeInBits); | |||
| 57126 | ||||
| 57127 | // Attempt to extract from the source of a shuffle vector. | |||
| 57128 | if ((InSizeInBits % SizeInBits) == 0 && (IdxVal % NumSubElts) == 0) { | |||
| 57129 | SmallVector<int, 32> ShuffleMask; | |||
| 57130 | SmallVector<int, 32> ScaledMask; | |||
| 57131 | SmallVector<SDValue, 2> ShuffleInputs; | |||
| 57132 | unsigned NumSubVecs = InSizeInBits / SizeInBits; | |||
| 57133 | // Decode the shuffle mask and scale it so that it shuffles subvectors. | |||
| 57134 | if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) && | |||
| 57135 | scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) { | |||
| 57136 | unsigned SubVecIdx = IdxVal / NumSubElts; | |||
| 57137 | if (ScaledMask[SubVecIdx] == SM_SentinelUndef) | |||
| 57138 | return DAG.getUNDEF(VT); | |||
| 57139 | if (ScaledMask[SubVecIdx] == SM_SentinelZero) | |||
| 57140 | return getZeroVector(VT, Subtarget, DAG, SDLoc(N)); | |||
| 57141 | SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs]; | |||
| 57142 | if (Src.getValueSizeInBits() == InSizeInBits) { | |||
| 57143 | unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs; | |||
| 57144 | unsigned SrcEltIdx = SrcSubVecIdx * NumSubElts; | |||
| 57145 | return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG, | |||
| 57146 | SDLoc(N), SizeInBits); | |||
| 57147 | } | |||
| 57148 | } | |||
| 57149 | } | |||
| 57150 | ||||
| 57151 | // If we're extracting the lowest subvector and we're the only user, | |||
| 57152 | // we may be able to perform this with a smaller vector width. | |||
| 57153 | unsigned InOpcode = InVec.getOpcode(); | |||
| 57154 | if (InVec.hasOneUse()) { | |||
| 57155 | if (IdxVal == 0 && VT == MVT::v2f64 && InVecVT == MVT::v4f64) { | |||
| 57156 | // v2f64 CVTDQ2PD(v4i32). | |||
| 57157 | if (InOpcode == ISD::SINT_TO_FP && | |||
| 57158 | InVec.getOperand(0).getValueType() == MVT::v4i32) { | |||
| 57159 | return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0)); | |||
| 57160 | } | |||
| 57161 | // v2f64 CVTUDQ2PD(v4i32). | |||
| 57162 | if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() && | |||
| 57163 | InVec.getOperand(0).getValueType() == MVT::v4i32) { | |||
| 57164 | return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0)); | |||
| 57165 | } | |||
| 57166 | // v2f64 CVTPS2PD(v4f32). | |||
| 57167 | if (InOpcode == ISD::FP_EXTEND && | |||
| 57168 | InVec.getOperand(0).getValueType() == MVT::v4f32) { | |||
| 57169 | return DAG.getNode(X86ISD::VFPEXT, SDLoc(N), VT, InVec.getOperand(0)); | |||
| 57170 | } | |||
| 57171 | } | |||
| 57172 | if (IdxVal == 0 && | |||
| 57173 | (ISD::isExtOpcode(InOpcode) || ISD::isExtVecInRegOpcode(InOpcode)) && | |||
| 57174 | (SizeInBits == 128 || SizeInBits == 256) && | |||
| 57175 | InVec.getOperand(0).getValueSizeInBits() >= SizeInBits) { | |||
| 57176 | SDLoc DL(N); | |||
| 57177 | SDValue Ext = InVec.getOperand(0); | |||
| 57178 | if (Ext.getValueSizeInBits() > SizeInBits) | |||
| 57179 | Ext = extractSubVector(Ext, 0, DAG, DL, SizeInBits); | |||
| 57180 | unsigned ExtOp = DAG.getOpcode_EXTEND_VECTOR_INREG(InOpcode); | |||
| 57181 | return DAG.getNode(ExtOp, DL, VT, Ext); | |||
| 57182 | } | |||
| 57183 | if (IdxVal == 0 && InOpcode == ISD::VSELECT && | |||
| 57184 | InVec.getOperand(0).getValueType().is256BitVector() && | |||
| 57185 | InVec.getOperand(1).getValueType().is256BitVector() && | |||
| 57186 | InVec.getOperand(2).getValueType().is256BitVector()) { | |||
| 57187 | SDLoc DL(N); | |||
| 57188 | SDValue Ext0 = extractSubVector(InVec.getOperand(0), 0, DAG, DL, 128); | |||
| 57189 | SDValue Ext1 = extractSubVector(InVec.getOperand(1), 0, DAG, DL, 128); | |||
| 57190 | SDValue Ext2 = extractSubVector(InVec.getOperand(2), 0, DAG, DL, 128); | |||
| 57191 | return DAG.getNode(InOpcode, DL, VT, Ext0, Ext1, Ext2); | |||
| 57192 | } | |||
| 57193 | if (IdxVal == 0 && InOpcode == ISD::TRUNCATE && Subtarget.hasVLX() && | |||
| 57194 | (VT.is128BitVector() || VT.is256BitVector())) { | |||
| 57195 | SDLoc DL(N); | |||
| 57196 | SDValue InVecSrc = InVec.getOperand(0); | |||
| 57197 | unsigned Scale = InVecSrc.getValueSizeInBits() / InSizeInBits; | |||
| 57198 | SDValue Ext = extractSubVector(InVecSrc, 0, DAG, DL, Scale * SizeInBits); | |||
| 57199 | return DAG.getNode(InOpcode, DL, VT, Ext); | |||
| 57200 | } | |||
| 57201 | if (InOpcode == X86ISD::MOVDDUP && | |||
| 57202 | (VT.is128BitVector() || VT.is256BitVector())) { | |||
| 57203 | SDLoc DL(N); | |||
| 57204 | SDValue Ext0 = | |||
| 57205 | extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits); | |||
| 57206 | return DAG.getNode(InOpcode, DL, VT, Ext0); | |||
| 57207 | } | |||
| 57208 | } | |||
| 57209 | ||||
| 57210 | // Always split vXi64 logical shifts where we're extracting the upper 32 bits, | |||
| 57211 | // as this is very likely to fold into a shuffle/truncation. | |||
| 57212 | if ((InOpcode == X86ISD::VSHLI || InOpcode == X86ISD::VSRLI) && | |||
| 57213 | InVecVT.getScalarSizeInBits() == 64 && | |||
| 57214 | InVec.getConstantOperandAPInt(1) == 32) { | |||
| 57215 | SDLoc DL(N); | |||
| 57216 | SDValue Ext = | |||
| 57217 | extractSubVector(InVec.getOperand(0), IdxVal, DAG, DL, SizeInBits); | |||
| 57218 | return DAG.getNode(InOpcode, DL, VT, Ext, InVec.getOperand(1)); | |||
| 57219 | } | |||
| 57220 | ||||
| 57221 | return SDValue(); | |||
| 57222 | } | |||
| 57223 | ||||
| 57224 | static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) { | |||
| 57225 | EVT VT = N->getValueType(0); | |||
| 57226 | SDValue Src = N->getOperand(0); | |||
| 57227 | SDLoc DL(N); | |||
| 57228 | ||||
| 57229 | // If this is a scalar to vector to v1i1 from an AND with 1, bypass the and. | |||
| 57230 | // This occurs frequently in our masked scalar intrinsic code and our | |||
| 57231 | // floating point select lowering with AVX512. | |||
| 57232 | // TODO: SimplifyDemandedBits instead? | |||
| 57233 | if (VT == MVT::v1i1 && Src.getOpcode() == ISD::AND && Src.hasOneUse() && | |||
| 57234 | isOneConstant(Src.getOperand(1))) | |||
| 57235 | return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Src.getOperand(0)); | |||
| 57236 | ||||
| 57237 | // Combine scalar_to_vector of an extract_vector_elt into an extract_subvec. | |||
| 57238 | if (VT == MVT::v1i1 && Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT && | |||
| 57239 | Src.hasOneUse() && Src.getOperand(0).getValueType().isVector() && | |||
| 57240 | Src.getOperand(0).getValueType().getVectorElementType() == MVT::i1) | |||
| 57241 | if (auto *C = dyn_cast<ConstantSDNode>(Src.getOperand(1))) | |||
| 57242 | if (C->isZero()) | |||
| 57243 | return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src.getOperand(0), | |||
| 57244 | Src.getOperand(1)); | |||
| 57245 | ||||
| 57246 | // Reduce v2i64 to v4i32 if we don't need the upper bits or they are known zero. | |||
| 57247 | // TODO: Move to DAGCombine/SimplifyDemandedBits? | |||
| 57248 | if ((VT == MVT::v2i64 || VT == MVT::v2f64) && Src.hasOneUse()) { | |||
| 57249 | auto IsExt64 = [&DAG](SDValue Op, bool IsZeroExt) { | |||
| 57250 | if (Op.getValueType() != MVT::i64) | |||
| 57251 | return SDValue(); | |||
| 57252 | unsigned Opc = IsZeroExt ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND; | |||
| 57253 | if (Op.getOpcode() == Opc && | |||
| 57254 | Op.getOperand(0).getScalarValueSizeInBits() <= 32) | |||
| 57255 | return Op.getOperand(0); | |||
| 57256 | unsigned Ext = IsZeroExt ? ISD::ZEXTLOAD : ISD::EXTLOAD; | |||
| 57257 | if (auto *Ld = dyn_cast<LoadSDNode>(Op)) | |||
| 57258 | if (Ld->getExtensionType() == Ext && | |||
| 57259 | Ld->getMemoryVT().getScalarSizeInBits() <= 32) | |||
| 57260 | return Op; | |||
| 57261 | if (IsZeroExt) { | |||
| 57262 | KnownBits Known = DAG.computeKnownBits(Op); | |||
| 57263 | if (!Known.isConstant() && Known.countMinLeadingZeros() >= 32) | |||
| 57264 | return Op; | |||
| 57265 | } | |||
| 57266 | return SDValue(); | |||
| 57267 | }; | |||
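| | // IsExt64 matches an i64 that is effectively <= 32 bits wide: an any/zero | |||
| | // extend (returning its narrow source), a matching extload, or (zext only) | |||
| | // a value whose upper 32 bits are known zero. | |||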
| 57268 | ||||
| 57269 | if (SDValue AnyExt = IsExt64(peekThroughOneUseBitcasts(Src), false)) | |||
| 57270 | return DAG.getBitcast( | |||
| 57271 | VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, | |||
| 57272 | DAG.getAnyExtOrTrunc(AnyExt, DL, MVT::i32))); | |||
| 57273 | ||||
| 57274 | if (SDValue ZeroExt = IsExt64(peekThroughOneUseBitcasts(Src), true)) | |||
| 57275 | return DAG.getBitcast( | |||
| 57276 | VT, | |||
| 57277 | DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v4i32, | |||
| 57278 | DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32, | |||
| 57279 | DAG.getZExtOrTrunc(ZeroExt, DL, MVT::i32)))); | |||
| 57280 | } | |||
| 57281 | ||||
| 57282 | // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ. | |||
| 57283 | if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST && | |||
| 57284 | Src.getOperand(0).getValueType() == MVT::x86mmx) | |||
| 57285 | return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0)); | |||
| 57286 | ||||
| 57287 | // See if we're broadcasting the scalar value, in which case just reuse that. | |||
| 57288 | // Make sure the broadcast uses this exact SDValue, not just the same node. | |||
| 57289 | if (VT.getScalarType() == Src.getValueType()) | |||
| 57290 | for (SDNode *User : Src->uses()) | |||
| 57291 | if (User->getOpcode() == X86ISD::VBROADCAST && | |||
| 57292 | Src == User->getOperand(0)) { | |||
| 57293 | unsigned SizeInBits = VT.getFixedSizeInBits(); | |||
| 57294 | unsigned BroadcastSizeInBits = | |||
| 57295 | User->getValueSizeInBits(0).getFixedValue(); | |||
| 57296 | if (BroadcastSizeInBits == SizeInBits) | |||
| 57297 | return SDValue(User, 0); | |||
| 57298 | if (BroadcastSizeInBits > SizeInBits) | |||
| 57299 | return extractSubVector(SDValue(User, 0), 0, DAG, DL, SizeInBits); | |||
| 57300 | // TODO: Handle BroadcastSizeInBits < SizeInBits when we have test | |||
| 57301 | // coverage. | |||
| 57302 | } | |||
| 57303 | ||||
| 57304 | return SDValue(); | |||
| 57305 | } | |||
| 57306 | ||||
| 57307 | // Simplify PMULDQ and PMULUDQ operations. | |||
| 57308 | static SDValue combinePMULDQ(SDNode *N, SelectionDAG &DAG, | |||
| 57309 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 57310 | const X86Subtarget &Subtarget) { | |||
| 57311 | SDValue LHS = N->getOperand(0); | |||
| 57312 | SDValue RHS = N->getOperand(1); | |||
| 57313 | ||||
| 57314 | // Canonicalize constant to RHS. | |||
| 57315 | if (DAG.isConstantIntBuildVectorOrConstantInt(LHS) && | |||
| 57316 | !DAG.isConstantIntBuildVectorOrConstantInt(RHS)) | |||
| 57317 | return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), RHS, LHS); | |||
| 57318 | ||||
| 57319 | // Multiply by zero. | |||
| 57320 | // Don't return RHS as it may contain UNDEFs. | |||
| 57321 | if (ISD::isBuildVectorAllZeros(RHS.getNode())) | |||
| 57322 | return DAG.getConstant(0, SDLoc(N), N->getValueType(0)); | |||
| 57323 | ||||
| 57324 | // PMULDQ/PMULUDQ only use the lower 32 bits of each vector element. | |||
| 57325 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 57326 | if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(64), DCI)) | |||
| 57327 | return SDValue(N, 0); | |||
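| | // e.g. this lets (pmuludq (and X, 0xffffffff), Y) fold to (pmuludq X, Y), | |||
| | // as the mask only preserves bits the multiply already reads. | |||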
| 57328 | ||||
| 57329 | // If the input is an extend_invec and the SimplifyDemandedBits call didn't | |||
| 57330 | // convert it to any_extend_invec, due to the LegalOperations check, do the | |||
| 57331 | // conversion directly to a vector shuffle manually. This exposes combine | |||
| 57332 | // opportunities missed by combineEXTEND_VECTOR_INREG not calling | |||
| 57333 | // combineX86ShufflesRecursively on SSE4.1 targets. | |||
| 57334 | // FIXME: This is basically a hack around several other issues related to | |||
| 57335 | // ANY_EXTEND_VECTOR_INREG. | |||
| 57336 | if (N->getValueType(0) == MVT::v2i64 && LHS.hasOneUse() && | |||
| 57337 | (LHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG || | |||
| 57338 | LHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) && | |||
| 57339 | LHS.getOperand(0).getValueType() == MVT::v4i32) { | |||
| 57340 | SDLoc dl(N); | |||
| 57341 | LHS = DAG.getVectorShuffle(MVT::v4i32, dl, LHS.getOperand(0), | |||
| 57342 | LHS.getOperand(0), { 0, -1, 1, -1 }); | |||
| 57343 | LHS = DAG.getBitcast(MVT::v2i64, LHS); | |||
| 57344 | return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS); | |||
| 57345 | } | |||
| 57346 | if (N->getValueType(0) == MVT::v2i64 && RHS.hasOneUse() && | |||
| 57347 | (RHS.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG || | |||
| 57348 | RHS.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG) && | |||
| 57349 | RHS.getOperand(0).getValueType() == MVT::v4i32) { | |||
| 57350 | SDLoc dl(N); | |||
| 57351 | RHS = DAG.getVectorShuffle(MVT::v4i32, dl, RHS.getOperand(0), | |||
| 57352 | RHS.getOperand(0), { 0, -1, 1, -1 }); | |||
| 57353 | RHS = DAG.getBitcast(MVT::v2i64, RHS); | |||
| 57354 | return DAG.getNode(N->getOpcode(), dl, MVT::v2i64, LHS, RHS); | |||
| 57355 | } | |||
| 57356 | ||||
| 57357 | return SDValue(); | |||
| 57358 | } | |||
| 57359 | ||||
| 57360 | // Simplify VPMADDUBSW/VPMADDWD operations. | |||
| 57361 | static SDValue combineVPMADD(SDNode *N, SelectionDAG &DAG, | |||
| 57362 | TargetLowering::DAGCombinerInfo &DCI) { | |||
| 57363 | EVT VT = N->getValueType(0); | |||
| 57364 | SDValue LHS = N->getOperand(0); | |||
| 57365 | SDValue RHS = N->getOperand(1); | |||
| 57366 | ||||
| 57367 | // Multiply by zero. | |||
| 57368 | // Don't return LHS/RHS as it may contain UNDEFs. | |||
| 57369 | if (ISD::isBuildVectorAllZeros(LHS.getNode()) || | |||
| 57370 | ISD::isBuildVectorAllZeros(RHS.getNode())) | |||
| 57371 | return DAG.getConstant(0, SDLoc(N), VT); | |||
| 57372 | ||||
| 57373 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 57374 | APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); | |||
| 57375 | if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI)) | |||
| 57376 | return SDValue(N, 0); | |||
| 57377 | ||||
| 57378 | return SDValue(); | |||
| 57379 | } | |||
| 57380 | ||||
| 57381 | static SDValue combineEXTEND_VECTOR_INREG(SDNode *N, SelectionDAG &DAG, | |||
| 57382 | TargetLowering::DAGCombinerInfo &DCI, | |||
| 57383 | const X86Subtarget &Subtarget) { | |||
| 57384 | EVT VT = N->getValueType(0); | |||
| 57385 | SDValue In = N->getOperand(0); | |||
| 57386 | unsigned Opcode = N->getOpcode(); | |||
| 57387 | unsigned InOpcode = In.getOpcode(); | |||
| 57388 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 57389 | SDLoc DL(N); | |||
| 57390 | ||||
| 57391 | // Try to merge vector loads and extend_inreg to an extload. | |||
| 57392 | if (!DCI.isBeforeLegalizeOps() && ISD::isNormalLoad(In.getNode()) && | |||
| 57393 | In.hasOneUse()) { | |||
| 57394 | auto *Ld = cast<LoadSDNode>(In); | |||
| 57395 | if (Ld->isSimple()) { | |||
| 57396 | MVT SVT = In.getSimpleValueType().getVectorElementType(); | |||
| 57397 | ISD::LoadExtType Ext = Opcode == ISD::SIGN_EXTEND_VECTOR_INREG | |||
| 57398 | ? ISD::SEXTLOAD | |||
| 57399 | : ISD::ZEXTLOAD; | |||
| 57400 | EVT MemVT = VT.changeVectorElementType(SVT); | |||
| 57401 | if (TLI.isLoadExtLegal(Ext, VT, MemVT)) { | |||
| 57402 | SDValue Load = DAG.getExtLoad( | |||
| 57403 | Ext, DL, VT, Ld->getChain(), Ld->getBasePtr(), Ld->getPointerInfo(), | |||
| 57404 | MemVT, Ld->getOriginalAlign(), Ld->getMemOperand()->getFlags()); | |||
| 57405 | DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1)); | |||
| 57406 | return Load; | |||
| 57407 | } | |||
| 57408 | } | |||
| 57409 | } | |||
| 57410 | ||||
| 57411 | // Fold EXTEND_VECTOR_INREG(EXTEND_VECTOR_INREG(X)) -> EXTEND_VECTOR_INREG(X). | |||
| 57412 | if (Opcode == InOpcode) | |||
| 57413 | return DAG.getNode(Opcode, DL, VT, In.getOperand(0)); | |||
| 57414 | ||||
| 57415 | // Fold EXTEND_VECTOR_INREG(EXTRACT_SUBVECTOR(EXTEND(X),0)) | |||
| 57416 | // -> EXTEND_VECTOR_INREG(X). | |||
| 57417 | // TODO: Handle non-zero subvector indices. | |||
| 57418 | if (InOpcode == ISD::EXTRACT_SUBVECTOR && In.getConstantOperandVal(1) == 0 && | |||
| 57419 | In.getOperand(0).getOpcode() == DAG.getOpcode_EXTEND(Opcode) && | |||
| 57420 | In.getOperand(0).getOperand(0).getValueSizeInBits() == | |||
| 57421 | In.getValueSizeInBits()) | |||
| 57422 | return DAG.getNode(Opcode, DL, VT, In.getOperand(0).getOperand(0)); | |||
| 57423 | ||||
| 57424 | // Fold EXTEND_VECTOR_INREG(BUILD_VECTOR(X,Y,?,?)) -> BUILD_VECTOR(X,0,Y,0). | |||
| 57425 | // TODO: Move to DAGCombine? | |||
| 57426 | if (!DCI.isBeforeLegalizeOps() && Opcode == ISD::ZERO_EXTEND_VECTOR_INREG && | |||
| 57427 | In.getOpcode() == ISD::BUILD_VECTOR && In.hasOneUse() && | |||
| 57428 | In.getValueSizeInBits() == VT.getSizeInBits()) { | |||
| 57429 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 57430 | unsigned Scale = VT.getScalarSizeInBits() / In.getScalarValueSizeInBits(); | |||
| 57431 | EVT EltVT = In.getOperand(0).getValueType(); | |||
| 57432 | SmallVector<SDValue> Elts(Scale * NumElts, DAG.getConstant(0, DL, EltVT)); | |||
| 57433 | for (unsigned I = 0; I != NumElts; ++I) | |||
| 57434 | Elts[I * Scale] = In.getOperand(I); | |||
| 57435 | return DAG.getBitcast(VT, DAG.getBuildVector(In.getValueType(), DL, Elts)); | |||
| 57436 | } | |||
| 57437 | ||||
| 57438 | // Attempt to combine as a shuffle on SSE41+ targets. | |||
| 57439 | if ((Opcode == ISD::ANY_EXTEND_VECTOR_INREG || | |||
| 57440 | Opcode == ISD::ZERO_EXTEND_VECTOR_INREG) && | |||
| 57441 | Subtarget.hasSSE41()) { | |||
| 57442 | SDValue Op(N, 0); | |||
| 57443 | if (TLI.isTypeLegal(VT) && TLI.isTypeLegal(In.getValueType())) | |||
| 57444 | if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) | |||
| 57445 | return Res; | |||
| 57446 | } | |||
| 57447 | ||||
| 57448 | return SDValue(); | |||
| 57449 | } | |||
| 57450 | ||||
| 57451 | static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG, | |||
| 57452 | TargetLowering::DAGCombinerInfo &DCI) { | |||
| 57453 | EVT VT = N->getValueType(0); | |||
| 57454 | ||||
| 57455 | if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode())) | |||
| 57456 | return DAG.getConstant(0, SDLoc(N), VT); | |||
| 57457 | ||||
| 57458 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 57459 | APInt DemandedElts = APInt::getAllOnes(VT.getVectorNumElements()); | |||
| 57460 | if (TLI.SimplifyDemandedVectorElts(SDValue(N, 0), DemandedElts, DCI)) | |||
| 57461 | return SDValue(N, 0); | |||
| 57462 | ||||
| 57463 | return SDValue(); | |||
| 57464 | } | |||
| 57465 | ||||
| 57466 | // Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS. | |||
| 57467 | // Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce | |||
| 57468 | // extra instructions between the conversions due to going to scalar and back. | |||
| 57469 | static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG, | |||
| 57470 | const X86Subtarget &Subtarget) { | |||
| 57471 | if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) | |||
| 57472 | return SDValue(); | |||
| 57473 | ||||
| 57474 | if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16) | |||
| 57475 | return SDValue(); | |||
| 57476 | ||||
| 57477 | if (N->getValueType(0) != MVT::f32 || | |||
| 57478 | N->getOperand(0).getOperand(0).getValueType() != MVT::f32) | |||
| 57479 | return SDValue(); | |||
| 57480 | ||||
| 57481 | SDLoc dl(N); | |||
| 57482 | SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, | |||
| 57483 | N->getOperand(0).getOperand(0)); | |||
| 57484 | Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res, | |||
| 57485 | DAG.getTargetConstant(4, dl, MVT::i32)); | |||
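| | // (The immediate 4 sets bit 2 of the CVTPS2PH rounding-control field, | |||
| | // i.e. round using the current MXCSR rounding mode.) | |||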
| 57486 | Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res); | |||
| 57487 | return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res, | |||
| 57488 | DAG.getIntPtrConstant(0, dl)); | |||
| 57489 | } | |||
| 57490 | ||||
| 57491 | static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG, | |||
| 57492 | const X86Subtarget &Subtarget) { | |||
| 57493 | if (!Subtarget.hasF16C() || Subtarget.useSoftFloat()) | |||
| 57494 | return SDValue(); | |||
| 57495 | ||||
| 57496 | if (Subtarget.hasFP16()) | |||
| 57497 | return SDValue(); | |||
| 57498 | ||||
| 57499 | bool IsStrict = N->isStrictFPOpcode(); | |||
| 57500 | EVT VT = N->getValueType(0); | |||
| 57501 | SDValue Src = N->getOperand(IsStrict ? 1 : 0); | |||
| 57502 | EVT SrcVT = Src.getValueType(); | |||
| 57503 | ||||
| 57504 | if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16) | |||
| 57505 | return SDValue(); | |||
| 57506 | ||||
| 57507 | if (VT.getVectorElementType() != MVT::f32 && | |||
| 57508 | VT.getVectorElementType() != MVT::f64) | |||
| 57509 | return SDValue(); | |||
| 57510 | ||||
| 57511 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 57512 | if (NumElts == 1 || !isPowerOf2_32(NumElts)) | |||
| 57513 | return SDValue(); | |||
| 57514 | ||||
| 57515 | SDLoc dl(N); | |||
| 57516 | ||||
| 57517 | // Convert the input to vXi16. | |||
| 57518 | EVT IntVT = SrcVT.changeVectorElementTypeToInteger(); | |||
| 57519 | Src = DAG.getBitcast(IntVT, Src); | |||
| 57520 | ||||
| 57521 | // Widen to at least 8 input elements. | |||
| 57522 | if (NumElts < 8) { | |||
| 57523 | unsigned NumConcats = 8 / NumElts; | |||
| 57524 | SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT) | |||
| 57525 | : DAG.getConstant(0, dl, IntVT); | |||
| 57526 | SmallVector<SDValue, 4> Ops(NumConcats, Fill); | |||
| 57527 | Ops[0] = Src; | |||
| 57528 | Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops); | |||
| 57529 | } | |||
| 57530 | ||||
| 57531 | // Destination is vXf32 with at least 4 elements. | |||
| 57532 | EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32, | |||
| 57533 | std::max(4U, NumElts)); | |||
| 57534 | SDValue Cvt, Chain; | |||
| 57535 | if (IsStrict) { | |||
| 57536 | Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other}, | |||
| 57537 | {N->getOperand(0), Src}); | |||
| 57538 | Chain = Cvt.getValue(1); | |||
| 57539 | } else { | |||
| 57540 | Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src); | |||
| 57541 | } | |||
| 57542 | ||||
| 57543 | if (NumElts < 4) { | |||
| 57544 | assert(NumElts == 2 && "Unexpected size"); | |||
| 57545 | Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt, | |||
| 57546 | DAG.getIntPtrConstant(0, dl)); | |||
| 57547 | } | |||
| 57548 | ||||
| 57549 | if (IsStrict) { | |||
| 57550 | // Extend to the original VT if necessary. | |||
| 57551 | if (Cvt.getValueType() != VT) { | |||
| 57552 | Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other}, | |||
| 57553 | {Chain, Cvt}); | |||
| 57554 | Chain = Cvt.getValue(1); | |||
| 57555 | } | |||
| 57556 | return DAG.getMergeValues({Cvt, Chain}, dl); | |||
| 57557 | } | |||
| 57558 | ||||
| 57559 | // Extend to the original VT if necessary. | |||
| 57560 | return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt); | |||
| 57561 | } | |||
| 57562 | ||||
| 57563 | // Try to find a larger VBROADCAST_LOAD/SUBV_BROADCAST_LOAD that we can extract | |||
| 57564 | // from. Limit this to cases where the loads have the same input chain and the | |||
| 57565 | // output chains are unused. This avoids any memory ordering issues. | |||
| 57566 | static SDValue combineBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG, | |||
| 57567 | TargetLowering::DAGCombinerInfo &DCI) { | |||
| 57568 | assert((N->getOpcode() == X86ISD::VBROADCAST_LOAD || | |||
| 57569 | N->getOpcode() == X86ISD::SUBV_BROADCAST_LOAD) && | |||
| 57570 | "Unknown broadcast load type"); | |||
| 57571 | ||||
| 57572 | // Only do this if the chain result is unused. | |||
| 57573 | if (N->hasAnyUseOfValue(1)) | |||
| 57574 | return SDValue(); | |||
| 57575 | ||||
| 57576 | auto *MemIntrin = cast<MemIntrinsicSDNode>(N); | |||
| 57577 | ||||
| 57578 | SDValue Ptr = MemIntrin->getBasePtr(); | |||
| 57579 | SDValue Chain = MemIntrin->getChain(); | |||
| 57580 | EVT VT = N->getSimpleValueType(0); | |||
| 57581 | EVT MemVT = MemIntrin->getMemoryVT(); | |||
| 57582 | ||||
| 57583 | // Look at other users of our base pointer and try to find a wider broadcast. | |||
| 57584 | // The input chain and the size of the memory VT must match. | |||
| 57585 | for (SDNode *User : Ptr->uses()) | |||
| 57586 | if (User != N && User->getOpcode() == N->getOpcode() && | |||
| 57587 | cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr && | |||
| 57588 | cast<MemIntrinsicSDNode>(User)->getChain() == Chain && | |||
| 57589 | cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() == | |||
| 57590 | MemVT.getSizeInBits() && | |||
| 57591 | !User->hasAnyUseOfValue(1) && | |||
| 57592 | User->getValueSizeInBits(0).getFixedValue() > VT.getFixedSizeInBits()) { | |||
| 57593 | SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N), | |||
| 57594 | VT.getSizeInBits()); | |||
| 57595 | Extract = DAG.getBitcast(VT, Extract); | |||
| 57596 | return DCI.CombineTo(N, Extract, SDValue(User, 1)); | |||
| 57597 | } | |||
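| | // CombineTo replaces both of N's results at once: the value with the low | |||
| | // subvector of the wider broadcast, and the dead chain with that load's chain. | |||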
| 57598 | ||||
| 57599 | return SDValue(); | |||
| 57600 | } | |||
| 57601 | ||||
| 57602 | static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG, | |||
| 57603 | const X86Subtarget &Subtarget) { | |||
| 57604 | if (!Subtarget.hasF16C() || Subtarget.useSoftFloat()) | |||
| 57605 | return SDValue(); | |||
| 57606 | ||||
| 57607 | bool IsStrict = N->isStrictFPOpcode(); | |||
| 57608 | EVT VT = N->getValueType(0); | |||
| 57609 | SDValue Src = N->getOperand(IsStrict ? 1 : 0); | |||
| 57610 | EVT SrcVT = Src.getValueType(); | |||
| 57611 | ||||
| 57612 | if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 || | |||
| 57613 | SrcVT.getVectorElementType() != MVT::f32) | |||
| 57614 | return SDValue(); | |||
| 57615 | ||||
| 57616 | SDLoc dl(N); | |||
| 57617 | ||||
| 57618 | SDValue Cvt, Chain; | |||
| 57619 | unsigned NumElts = VT.getVectorNumElements(); | |||
| 57620 | if (Subtarget.hasFP16()) { | |||
| 57621 | // Combine (v8f16 fp_round(concat_vectors(v4f32 (xint_to_fp v4i64), ..))) | |||
| 57622 | // into (v8f16 vector_shuffle(v8f16 (CVTXI2P v4i64), ..)) | |||
| 57623 | if (NumElts == 8 && Src.getOpcode() == ISD::CONCAT_VECTORS) { | |||
| 57624 | SDValue Cvt0, Cvt1; | |||
| 57625 | SDValue Op0 = Src.getOperand(0); | |||
| 57626 | SDValue Op1 = Src.getOperand(1); | |||
| 57627 | bool IsOp0Strict = Op0->isStrictFPOpcode(); | |||
| 57628 | if (Op0.getOpcode() != Op1.getOpcode() || | |||
| 57629 | Op0.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64 || | |||
| 57630 | Op1.getOperand(IsOp0Strict ? 1 : 0).getValueType() != MVT::v4i64) { | |||
| 57631 | return SDValue(); | |||
| 57632 | } | |||
| 57633 | int Mask[8] = {0, 1, 2, 3, 8, 9, 10, 11}; | |||
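| | // Shuffle indices >= 8 select from the second operand, so this mask takes the | |||
| | // low four f16 results of Cvt0 followed by the low four of Cvt1. | |||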
| 57634 | if (IsStrict) { | |||
| 57635 | assert(IsOp0Strict && "Op0 must be strict node"); | |||
| 57636 | unsigned Opc = Op0.getOpcode() == ISD::STRICT_SINT_TO_FP | |||
| 57637 | ? X86ISD::STRICT_CVTSI2P | |||
| 57638 | : X86ISD::STRICT_CVTUI2P; | |||
| 57639 | Cvt0 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other}, | |||
| 57640 | {Op0.getOperand(0), Op0.getOperand(1)}); | |||
| 57641 | Cvt1 = DAG.getNode(Opc, dl, {MVT::v8f16, MVT::Other}, | |||
| 57642 | {Op1.getOperand(0), Op1.getOperand(1)}); | |||
| 57643 | Cvt = DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask); | |||
| 57644 | return DAG.getMergeValues({Cvt, Cvt0.getValue(1)}, dl); | |||
| 57645 | } | |||
| 57646 | unsigned Opc = Op0.getOpcode() == ISD::SINT_TO_FP ? X86ISD::CVTSI2P | |||
| 57647 | : X86ISD::CVTUI2P; | |||
| 57648 | Cvt0 = DAG.getNode(Opc, dl, MVT::v8f16, Op0.getOperand(0)); | |||
| 57649 | Cvt1 = DAG.getNode(Opc, dl, MVT::v8f16, Op1.getOperand(0)); | |||
| 57650 | return DAG.getVectorShuffle(MVT::v8f16, dl, Cvt0, Cvt1, Mask); | |||
| 57651 | } | |||
| 57652 | return SDValue(); | |||
| 57653 | } | |||
| 57654 | ||||
| 57655 | if (NumElts == 1 || !isPowerOf2_32(NumElts)) | |||
| 57656 | return SDValue(); | |||
| 57657 | ||||
| 57658 | // Widen to at least 4 input elements. | |||
| 57659 | if (NumElts < 4) | |||
| 57660 | Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src, | |||
| 57661 | DAG.getConstantFP(0.0, dl, SrcVT)); | |||
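| | // e.g. a v2f32 source is padded with zeroes up to v4f32 before the CVTPS2PH. | |||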
| 57662 | ||||
| 57663 | // Destination is vXi16 with at least 8 elements. | |||
| 57664 | EVT CvtVT = | |||
| 57665 | EVT::getVectorVT(*DAG.getContext(), MVT::i16, std::max(8U, NumElts)); | |||
| 57666 | SDValue Rnd = DAG.getTargetConstant(4, dl, MVT::i32); | |||
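| | // Immediate 4 sets bit 2 of the CVTPS2PH control byte: round using the | |||
| | // current MXCSR.RC mode rather than a statically encoded rounding mode. | |||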
| 57667 | if (IsStrict) { | |||
| 57668 | Cvt = DAG.getNode(X86ISD::STRICT_CVTPS2PH, dl, {CvtVT, MVT::Other}, | |||
| 57669 | {N->getOperand(0), Src, Rnd}); | |||
| 57670 | Chain = Cvt.getValue(1); | |||
| 57671 | } else { | |||
| 57672 | Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src, Rnd); | |||
| 57673 | } | |||
| 57674 | ||||
| 57675 | // Extract back down to the real number of elements. | |||
| 57676 | if (NumElts < 8) { | |||
| 57677 | EVT IntVT = VT.changeVectorElementTypeToInteger(); | |||
| 57678 | Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt, | |||
| 57679 | DAG.getIntPtrConstant(0, dl)); | |||
| 57680 | } | |||
| 57681 | ||||
| 57682 | Cvt = DAG.getBitcast(VT, Cvt); | |||
| 57683 | ||||
| 57684 | if (IsStrict) | |||
| 57685 | return DAG.getMergeValues({Cvt, Chain}, dl); | |||
| 57686 | ||||
| 57687 | return Cvt; | |||
| 57688 | } | |||
| 57689 | ||||
| 57690 | static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) { | |||
| 57691 | SDValue Src = N->getOperand(0); | |||
| 57692 | ||||
| 57693 | // Turn MOVDQ2Q+simple_load into an mmx load. | |||
| 57694 | if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) { | |||
| 57695 | LoadSDNode *LN = cast<LoadSDNode>(Src.getNode()); | |||
| 57696 | ||||
| 57697 | if (LN->isSimple()) { | |||
| 57698 | SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(), | |||
| 57699 | LN->getBasePtr(), | |||
| 57700 | LN->getPointerInfo(), | |||
| 57701 | LN->getOriginalAlign(), | |||
| 57702 | LN->getMemOperand()->getFlags()); | |||
| 57703 | DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1)); | |||
| 57704 | return NewLd; | |||
| 57705 | } | |||
| 57706 | } | |||
| 57707 | ||||
| 57708 | return SDValue(); | |||
| 57709 | } | |||
| 57710 | ||||
| 57711 | static SDValue combinePDEP(SDNode *N, SelectionDAG &DAG, | |||
| 57712 | TargetLowering::DAGCombinerInfo &DCI) { | |||
| 57713 | unsigned NumBits = N->getSimpleValueType(0).getSizeInBits(); | |||
| 57714 | const TargetLowering &TLI = DAG.getTargetLoweringInfo(); | |||
| 57715 | if (TLI.SimplifyDemandedBits(SDValue(N, 0), APInt::getAllOnes(NumBits), DCI)) | |||
| 57716 | return SDValue(N, 0); | |||
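| | // Returning N itself signals the combiner that the node was updated in place. | |||
| | // Even with all result bits demanded, the PDEP handling can still simplify | |||
| | // the inputs, e.g. a constant mask limits which source bits matter. | |||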
| 57717 | ||||
| 57718 | return SDValue(); | |||
| 57719 | } | |||
| 57720 | ||||
| 57721 | SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, | |||
| 57722 | DAGCombinerInfo &DCI) const { | |||
| 57723 | SelectionDAG &DAG = DCI.DAG; | |||
| 57724 | switch (N->getOpcode()) { | |||
| 57725 | default: break; | |||
| 57726 | case ISD::SCALAR_TO_VECTOR: | |||
| 57727 | return combineScalarToVector(N, DAG); | |||
| 57728 | case ISD::EXTRACT_VECTOR_ELT: | |||
| 57729 | case X86ISD::PEXTRW: | |||
| 57730 | case X86ISD::PEXTRB: | |||
| 57731 | return combineExtractVectorElt(N, DAG, DCI, Subtarget); | |||
| 57732 | case ISD::CONCAT_VECTORS: | |||
| 57733 | return combineCONCAT_VECTORS(N, DAG, DCI, Subtarget); | |||
| 57734 | case ISD::INSERT_SUBVECTOR: | |||
| 57735 | return combineINSERT_SUBVECTOR(N, DAG, DCI, Subtarget); | |||
| 57736 | case ISD::EXTRACT_SUBVECTOR: | |||
| 57737 | return combineEXTRACT_SUBVECTOR(N, DAG, DCI, Subtarget); | |||
| 57738 | case ISD::VSELECT: | |||
| 57739 | case ISD::SELECT: | |||
| 57740 | case X86ISD::BLENDV: return combineSelect(N, DAG, DCI, Subtarget); | |||
| 57741 | case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget); | |||
| 57742 | case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget); | |||
| 57743 | case X86ISD::CMP: return combineCMP(N, DAG); | |||
| 57744 | case ISD::ADD: return combineAdd(N, DAG, DCI, Subtarget); | |||
| 57745 | case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget); | |||
| 57746 | case X86ISD::ADD: | |||
| 57747 | case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI); | |||
| 57748 | case X86ISD::SBB: return combineSBB(N, DAG); | |||
| 57749 | case X86ISD::ADC: return combineADC(N, DAG, DCI); | |||
| 57750 | case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget); | |||
| 57751 | case ISD::SHL: return combineShiftLeft(N, DAG); | |||
| 57752 | case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget); | |||
| 57753 | case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget); | |||
| 57754 | case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget); | |||
| 57755 | case ISD::OR: return combineOr(N, DAG, DCI, Subtarget); | |||
| 57756 | case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget); | |||
| 57757 | case X86ISD::BEXTR: | |||
| 57758 | case X86ISD::BEXTRI: return combineBEXTR(N, DAG, DCI, Subtarget); | |||
| 57759 | case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget); | |||
| 57760 | case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget); | |||
| 57761 | case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget); | |||
| 57762 | case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget); | |||
| 57763 | case X86ISD::VEXTRACT_STORE: | |||
| 57764 | return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget); | |||
| 57765 | case ISD::SINT_TO_FP: | |||
| 57766 | case ISD::STRICT_SINT_TO_FP: | |||
| 57767 | return combineSIntToFP(N, DAG, DCI, Subtarget); | |||
| 57768 | case ISD::UINT_TO_FP: | |||
| 57769 | case ISD::STRICT_UINT_TO_FP: | |||
| 57770 | return combineUIntToFP(N, DAG, Subtarget); | |||
| 57771 | case ISD::FADD: | |||
| 57772 | case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget); | |||
| 57773 | case X86ISD::VFCMULC: | |||
| 57774 | case X86ISD::VFMULC: return combineFMulcFCMulc(N, DAG, Subtarget); | |||
| 57775 | case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget); | |||
| 57776 | case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget); | |||
| 57777 | case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI); | |||
| 57778 | case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget); | |||
| 57779 | case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget); | |||
| 57780 | case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget); | |||
| 57781 | case X86ISD::FXOR: | |||
| 57782 | case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget); | |||
| 57783 | case X86ISD::FMIN: | |||
| 57784 | case X86ISD::FMAX: return combineFMinFMax(N, DAG); | |||
| 57785 | case ISD::FMINNUM: | |||
| 57786 | case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget); | |||
| 57787 | case X86ISD::CVTSI2P: | |||
| 57788 | case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI); | |||
| 57789 | case X86ISD::CVTP2SI: | |||
| 57790 | case X86ISD::CVTP2UI: | |||
| 57791 | case X86ISD::STRICT_CVTTP2SI: | |||
| 57792 | case X86ISD::CVTTP2SI: | |||
| 57793 | case X86ISD::STRICT_CVTTP2UI: | |||
| 57794 | case X86ISD::CVTTP2UI: | |||
| 57795 | return combineCVTP2I_CVTTP2I(N, DAG, DCI); | |||
| 57796 | case X86ISD::STRICT_CVTPH2PS: | |||
| 57797 | case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI); | |||
| 57798 | case X86ISD::BT: return combineBT(N, DAG, DCI); | |||
| 57799 | case ISD::ANY_EXTEND: | |||
| 57800 | case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget); | |||
| 57801 | case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget); | |||
| 57802 | case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget); | |||
| 57803 | case ISD::ANY_EXTEND_VECTOR_INREG: | |||
| 57804 | case ISD::SIGN_EXTEND_VECTOR_INREG: | |||
| 57805 | case ISD::ZERO_EXTEND_VECTOR_INREG: | |||
| 57806 | return combineEXTEND_VECTOR_INREG(N, DAG, DCI, Subtarget); | |||
| 57807 | case ISD::SETCC: return combineSetCC(N, DAG, DCI, Subtarget); | |||
| 57808 | case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget); | |||
| 57809 | case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget); | |||
| 57810 | case X86ISD::PACKSS: | |||
| 57811 | case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget); | |||
| 57812 | case X86ISD::HADD: | |||
| 57813 | case X86ISD::HSUB: | |||
| 57814 | case X86ISD::FHADD: | |||
| 57815 | case X86ISD::FHSUB: return combineVectorHADDSUB(N, DAG, DCI, Subtarget); | |||
| 57816 | case X86ISD::VSHL: | |||
| 57817 | case X86ISD::VSRA: | |||
| 57818 | case X86ISD::VSRL: | |||
| 57819 | return combineVectorShiftVar(N, DAG, DCI, Subtarget); | |||
| 57820 | case X86ISD::VSHLI: | |||
| 57821 | case X86ISD::VSRAI: | |||
| 57822 | case X86ISD::VSRLI: | |||
| 57823 | return combineVectorShiftImm(N, DAG, DCI, Subtarget); | |||
| 57824 | case ISD::INSERT_VECTOR_ELT: | |||
| 57825 | case X86ISD::PINSRB: | |||
| 57826 | case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget); | |||
| 57827 | case X86ISD::SHUFP: // Handle all target specific shuffles | |||
| 57828 | case X86ISD::INSERTPS: | |||
| 57829 | case X86ISD::EXTRQI: | |||
| 57830 | case X86ISD::INSERTQI: | |||
| 57831 | case X86ISD::VALIGN: | |||
| 57832 | case X86ISD::PALIGNR: | |||
| 57833 | case X86ISD::VSHLDQ: | |||
| 57834 | case X86ISD::VSRLDQ: | |||
| 57835 | case X86ISD::BLENDI: | |||
| 57836 | case X86ISD::UNPCKH: | |||
| 57837 | case X86ISD::UNPCKL: | |||
| 57838 | case X86ISD::MOVHLPS: | |||
| 57839 | case X86ISD::MOVLHPS: | |||
| 57840 | case X86ISD::PSHUFB: | |||
| 57841 | case X86ISD::PSHUFD: | |||
| 57842 | case X86ISD::PSHUFHW: | |||
| 57843 | case X86ISD::PSHUFLW: | |||
| 57844 | case X86ISD::MOVSHDUP: | |||
| 57845 | case X86ISD::MOVSLDUP: | |||
| 57846 | case X86ISD::MOVDDUP: | |||
| 57847 | case X86ISD::MOVSS: | |||
| 57848 | case X86ISD::MOVSD: | |||
| 57849 | case X86ISD::MOVSH: | |||
| 57850 | case X86ISD::VBROADCAST: | |||
| 57851 | case X86ISD::VPPERM: | |||
| 57852 | case X86ISD::VPERMI: | |||
| 57853 | case X86ISD::VPERMV: | |||
| 57854 | case X86ISD::VPERMV3: | |||
| 57855 | case X86ISD::VPERMIL2: | |||
| 57856 | case X86ISD::VPERMILPI: | |||
| 57857 | case X86ISD::VPERMILPV: | |||
| 57858 | case X86ISD::VPERM2X128: | |||
| 57859 | case X86ISD::SHUF128: | |||
| 57860 | case X86ISD::VZEXT_MOVL: | |||
| 57861 | case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget); | |||
| 57862 | case X86ISD::FMADD_RND: | |||
| 57863 | case X86ISD::FMSUB: | |||
| 57864 | case X86ISD::STRICT_FMSUB: | |||
| 57865 | case X86ISD::FMSUB_RND: | |||
| 57866 | case X86ISD::FNMADD: | |||
| 57867 | case X86ISD::STRICT_FNMADD: | |||
| 57868 | case X86ISD::FNMADD_RND: | |||
| 57869 | case X86ISD::FNMSUB: | |||
| 57870 | case X86ISD::STRICT_FNMSUB: | |||
| 57871 | case X86ISD::FNMSUB_RND: | |||
| 57872 | case ISD::FMA: | |||
| 57873 | case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget); | |||
| 57874 | case X86ISD::FMADDSUB_RND: | |||
| 57875 | case X86ISD::FMSUBADD_RND: | |||
| 57876 | case X86ISD::FMADDSUB: | |||
| 57877 | case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, DCI); | |||
| 57878 | case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI, Subtarget); | |||
| 57879 | case X86ISD::TESTP: return combineTESTP(N, DAG, DCI, Subtarget); | |||
| 57880 | case X86ISD::MGATHER: | |||
| 57881 | case X86ISD::MSCATTER: | |||
| 57882 | return combineX86GatherScatter(N, DAG, DCI, Subtarget); | |||
| 57883 | case ISD::MGATHER: | |||
| 57884 | case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI); | |||
| 57885 | case X86ISD::PCMPEQ: | |||
| 57886 | case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); | |||
| 57887 | case X86ISD::PMULDQ: | |||
| 57888 | case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget); | |||
| 57889 | case X86ISD::VPMADDUBSW: | |||
| 57890 | case X86ISD::VPMADDWD: return combineVPMADD(N, DAG, DCI); | |||
| 57891 | case X86ISD::KSHIFTL: | |||
| 57892 | case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI); | |||
| 57893 | case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget); | |||
| 57894 | case ISD::STRICT_FP_EXTEND: | |||
| 57895 | case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget); | |||
| 57896 | case ISD::STRICT_FP_ROUND: | |||
| 57897 | case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget); | |||
| 57898 | case X86ISD::VBROADCAST_LOAD: | |||
| 57899 | case X86ISD::SUBV_BROADCAST_LOAD: return combineBROADCAST_LOAD(N, DAG, DCI); | |||
| 57900 | case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG); | |||
| 57901 | case X86ISD::PDEP: return combinePDEP(N, DAG, DCI); | |||
| 57902 | } | |||
| 57903 | ||||
| 57904 | return SDValue(); | |||
| 57905 | } | |||
| 57906 | ||||
| 57907 | bool X86TargetLowering::preferABDSToABSWithNSW(EVT VT) const { | |||
| 57908 | return false; | |||
| 57909 | } | |||
| 57910 | ||||
| 57911 | bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const { | |||
| 57912 | if (!isTypeLegal(VT)) | |||
| 57913 | return false; | |||
| 57914 | ||||
| 57915 | // There are no vXi8 shifts. | |||
| 57916 | if (Opc == ISD::SHL && VT.isVector() && VT.getVectorElementType() == MVT::i8) | |||
| 57917 | return false; | |||
| 57918 | ||||
| 57919 | // TODO: Almost no 8-bit ops are desirable because they have no actual | |||
| 57920 | // size/speed advantages vs. 32-bit ops, but they do have a major | |||
| 57921 | // potential disadvantage by causing partial register stalls. | |||
| 57922 | // | |||
| 57923 | // 8-bit multiply/shl is probably not cheaper than 32-bit multiply/shl, and | |||
| 57924 | // we have specializations to turn 32-bit multiply/shl into LEA or other ops. | |||
| 57925 | // Also, see the comment in "IsDesirableToPromoteOp" - where we additionally | |||
| 57926 | // check for a constant operand to the multiply. | |||
| 57927 | if ((Opc == ISD::MUL || Opc == ISD::SHL) && VT == MVT::i8) | |||
| 57928 | return false; | |||
| 57929 | ||||
| 57930 | // i16 instruction encodings are longer and some i16 instructions are slow, | |||
| 57931 | // so those are not desirable. | |||
| 57932 | if (VT == MVT::i16) { | |||
| 57933 | switch (Opc) { | |||
| 57934 | default: | |||
| 57935 | break; | |||
| 57936 | case ISD::LOAD: | |||
| 57937 | case ISD::SIGN_EXTEND: | |||
| 57938 | case ISD::ZERO_EXTEND: | |||
| 57939 | case ISD::ANY_EXTEND: | |||
| 57940 | case ISD::SHL: | |||
| 57941 | case ISD::SRA: | |||
| 57942 | case ISD::SRL: | |||
| 57943 | case ISD::SUB: | |||
| 57944 | case ISD::ADD: | |||
| 57945 | case ISD::MUL: | |||
| 57946 | case ISD::AND: | |||
| 57947 | case ISD::OR: | |||
| 57948 | case ISD::XOR: | |||
| 57949 | return false; | |||
| 57950 | } | |||
| 57951 | } | |||
| 57952 | ||||
| 57953 | // Any legal type not explicitly accounted for above here is desirable. | |||
| 57954 | return true; | |||
| 57955 | } | |||
| 57956 | ||||
| 57957 | SDValue X86TargetLowering::expandIndirectJTBranch(const SDLoc& dl, | |||
| 57958 | SDValue Value, SDValue Addr, | |||
| 57959 | SelectionDAG &DAG) const { | |||
| 57960 | const Module *M = DAG.getMachineFunction().getMMI().getModule(); | |||
| 57961 | Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch"); | |||
| 57962 | if (IsCFProtectionSupported) { | |||
| 57963 | // When control-flow branch protection is enabled, we need to add the | |||
| 57964 | // notrack prefix to the indirect branch. To do that we create an | |||
| 57965 | // NT_BRIND SDNode; upon ISel, the pattern converts it into a jmp with | |||
| 57966 | // the NoTrack prefix. | |||
| 57967 | return DAG.getNode(X86ISD::NT_BRIND, dl, MVT::Other, Value, Addr); | |||
| 57968 | } | |||
| 57969 | ||||
| 57970 | return TargetLowering::expandIndirectJTBranch(dl, Value, Addr, DAG); | |||
| 57971 | } | |||
| 57972 | ||||
| 57973 | TargetLowering::AndOrSETCCFoldKind | |||
| 57974 | X86TargetLowering::isDesirableToCombineLogicOpOfSETCC( | |||
| 57975 | const SDNode *LogicOp, const SDNode *SETCC0, const SDNode *SETCC1) const { | |||
| 57976 | using AndOrSETCCFoldKind = TargetLowering::AndOrSETCCFoldKind; | |||
| 57977 | EVT VT = LogicOp->getValueType(0); | |||
| 57978 | EVT OpVT = SETCC0->getOperand(0).getValueType(); | |||
| 57979 | if (!VT.isInteger()) | |||
| 57980 | return AndOrSETCCFoldKind::None; | |||
| 57981 | ||||
| 57982 | if (VT.isVector()) | |||
| 57983 | return AndOrSETCCFoldKind(AndOrSETCCFoldKind::NotAnd | | |||
| 57984 | (isOperationLegal(ISD::ABS, OpVT) | |||
| 57985 | ? AndOrSETCCFoldKind::ABS | |||
| 57986 | : AndOrSETCCFoldKind::None)); | |||
| 57987 | ||||
| 57988 | // Don't use `NotAnd`: even though `not` is generally shorter code than `add`, | |||
| 57989 | // `add` can lower to LEA, which can save moves / spills. In any case where | |||
| 57990 | // `NotAnd` applies, `AddAnd` does as well. | |||
| 57991 | // TODO: Currently we lower (icmp eq/ne (and ~X, Y), 0) -> `test (not X), Y`; | |||
| 57992 | // if we change that to `andn Y, X`, it may be worth preferring `NotAnd` here. | |||
| 57993 | return AndOrSETCCFoldKind::AddAnd; | |||
| 57994 | } | |||
| 57995 | ||||
| 57996 | bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { | |||
| 57997 | EVT VT = Op.getValueType(); | |||
| 57998 | bool Is8BitMulByConstant = VT == MVT::i8 && Op.getOpcode() == ISD::MUL && | |||
| 57999 | isa<ConstantSDNode>(Op.getOperand(1)); | |||
| 58000 | ||||
| 58001 | // i16 is legal, but undesirable since i16 instruction encodings are longer | |||
| 58002 | // and some i16 instructions are slow. | |||
| 58003 | // 8-bit multiply-by-constant can usually be expanded to something cheaper | |||
| 58004 | // using LEA and/or other ALU ops. | |||
| 58005 | if (VT != MVT::i16 && !Is8BitMulByConstant) | |||
| 58006 | return false; | |||
| 58007 | ||||
| 58008 | auto IsFoldableRMW = [](SDValue Load, SDValue Op) { | |||
| 58009 | if (!Op.hasOneUse()) | |||
| 58010 | return false; | |||
| 58011 | SDNode *User = *Op->use_begin(); | |||
| 58012 | if (!ISD::isNormalStore(User)) | |||
| 58013 | return false; | |||
| 58014 | auto *Ld = cast<LoadSDNode>(Load); | |||
| 58015 | auto *St = cast<StoreSDNode>(User); | |||
| 58016 | return Ld->getBasePtr() == St->getBasePtr(); | |||
| 58017 | }; | |||
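| | // e.g. for (store (add (load p), x), p) we keep the i16 add so it can fold | |||
| | // into a single read-modify-write add [mem], reg. | |||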
| 58018 | ||||
| 58019 | auto IsFoldableAtomicRMW = [](SDValue Load, SDValue Op) { | |||
| 58020 | if (!Load.hasOneUse() || Load.getOpcode() != ISD::ATOMIC_LOAD) | |||
| 58021 | return false; | |||
| 58022 | if (!Op.hasOneUse()) | |||
| 58023 | return false; | |||
| 58024 | SDNode *User = *Op->use_begin(); | |||
| 58025 | if (User->getOpcode() != ISD::ATOMIC_STORE) | |||
| 58026 | return false; | |||
| 58027 | auto *Ld = cast<AtomicSDNode>(Load); | |||
| 58028 | auto *St = cast<AtomicSDNode>(User); | |||
| 58029 | return Ld->getBasePtr() == St->getBasePtr(); | |||
| 58030 | }; | |||
| 58031 | ||||
| 58032 | bool Commute = false; | |||
| 58033 | switch (Op.getOpcode()) { | |||
| 58034 | default: return false; | |||
| 58035 | case ISD::SIGN_EXTEND: | |||
| 58036 | case ISD::ZERO_EXTEND: | |||
| 58037 | case ISD::ANY_EXTEND: | |||
| 58038 | break; | |||
| 58039 | case ISD::SHL: | |||
| 58040 | case ISD::SRA: | |||
| 58041 | case ISD::SRL: { | |||
| 58042 | SDValue N0 = Op.getOperand(0); | |||
| 58043 | // Look out for (store (shl (load), x)). | |||
| 58044 | if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op)) | |||
| 58045 | return false; | |||
| 58046 | break; | |||
| 58047 | } | |||
| 58048 | case ISD::ADD: | |||
| 58049 | case ISD::MUL: | |||
| 58050 | case ISD::AND: | |||
| 58051 | case ISD::OR: | |||
| 58052 | case ISD::XOR: | |||
| 58053 | Commute = true; | |||
| 58054 | [[fallthrough]]; | |||
| 58055 | case ISD::SUB: { | |||
| 58056 | SDValue N0 = Op.getOperand(0); | |||
| 58057 | SDValue N1 = Op.getOperand(1); | |||
| 58058 | // Avoid disabling potential load folding opportunities. | |||
| 58059 | if (X86::mayFoldLoad(N1, Subtarget) && | |||
| 58060 | (!Commute || !isa<ConstantSDNode>(N0) || | |||
| 58061 | (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op)))) | |||
| 58062 | return false; | |||
| 58063 | if (X86::mayFoldLoad(N0, Subtarget) && | |||
| 58064 | ((Commute && !isa<ConstantSDNode>(N1)) || | |||
| 58065 | (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op)))) | |||
| 58066 | return false; | |||
| 58067 | if (IsFoldableAtomicRMW(N0, Op) || | |||
| 58068 | (Commute && IsFoldableAtomicRMW(N1, Op))) | |||
| 58069 | return false; | |||
| 58070 | } | |||
| 58071 | } | |||
| 58072 | ||||
| 58073 | PVT = MVT::i32; | |||
| 58074 | return true; | |||
| 58075 | } | |||
| 58076 | ||||
| 58077 | //===----------------------------------------------------------------------===// | |||
| 58078 | // X86 Inline Assembly Support | |||
| 58079 | //===----------------------------------------------------------------------===// | |||
| 58080 | ||||
| 58081 | // Helper to match a string separated by whitespace. | |||
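| | // e.g. matchAsm(" bswap $0", {"bswap", "$0"}) matches, while "bswapper $0" | |||
| | // is rejected because "bswap" would only match a prefix of the first token. | |||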
| 58082 | static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) { | |||
| 58083 | S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace. | |||
| 58084 | ||||
| 58085 | for (StringRef Piece : Pieces) { | |||
| 58086 | if (!S.startswith(Piece)) // Check if the piece matches. | |||
| 58087 | return false; | |||
| 58088 | ||||
| 58089 | S = S.substr(Piece.size()); | |||
| 58090 | StringRef::size_type Pos = S.find_first_not_of(" \t"); | |||
| 58091 | if (Pos == 0) // The piece matched only a prefix of a longer token. | |||
| 58092 | return false; | |||
| 58093 | ||||
| 58094 | S = S.substr(Pos); | |||
| 58095 | } | |||
| 58096 | ||||
| 58097 | return S.empty(); | |||
| 58098 | } | |||
| 58099 | ||||
| 58100 | static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) { | |||
| 58101 | ||||
| 58102 | if (AsmPieces.size() == 3 || AsmPieces.size() == 4) { | |||
| 58103 | if (llvm::is_contained(AsmPieces, "~{cc}") && | |||
| 58104 | llvm::is_contained(AsmPieces, "~{flags}") && | |||
| 58105 | llvm::is_contained(AsmPieces, "~{fpsr}")) { | |||
| 58106 | ||||
| 58107 | if (AsmPieces.size() == 3) | |||
| 58108 | return true; | |||
| 58109 | else if (llvm::is_contained(AsmPieces, "~{dirflag}")) | |||
| 58110 | return true; | |||
| 58111 | } | |||
| 58112 | } | |||
| 58113 | return false; | |||
| 58114 | } | |||
| 58115 | ||||
| 58116 | bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const { | |||
| 58117 | InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand()); | |||
| 58118 | ||||
| 58119 | const std::string &AsmStr = IA->getAsmString(); | |||
| 58120 | ||||
| 58121 | IntegerType *Ty = dyn_cast<IntegerType>(CI->getType()); | |||
| 58122 | if (!Ty || Ty->getBitWidth() % 16 != 0) | |||
| 58123 | return false; | |||
| 58124 | ||||
| 58125 | // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a" | |||
| 58126 | SmallVector<StringRef, 4> AsmPieces; | |||
| 58127 | SplitString(AsmStr, AsmPieces, ";\n"); | |||
| 58128 | ||||
| 58129 | switch (AsmPieces.size()) { | |||
| 58130 | default: return false; | |||
| 58131 | case 1: | |||
| 58132 | // FIXME: this should verify that we are targeting a 486 or better. If not, | |||
| 58133 | // we will turn this bswap into something that will be lowered to logical | |||
| 58134 | // ops instead of emitting the bswap asm. For now, we don't support 486 or | |||
| 58135 | // lower so don't worry about this. | |||
| 58136 | // bswap $0 | |||
| 58137 | if (matchAsm(AsmPieces[0], {"bswap", "$0"}) || | |||
| 58138 | matchAsm(AsmPieces[0], {"bswapl", "$0"}) || | |||
| 58139 | matchAsm(AsmPieces[0], {"bswapq", "$0"}) || | |||
| 58140 | matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) || | |||
| 58141 | matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) || | |||
| 58142 | matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) { | |||
| 58143 | // No need to check constraints; nothing other than the equivalent of | |||
| 58144 | // "=r,0" would be valid here. | |||
| 58145 | return IntrinsicLowering::LowerToByteSwap(CI); | |||
| 58146 | } | |||
| 58147 | ||||
| 58148 | // rorw $$8, ${0:w} --> llvm.bswap.i16 | |||
| 58149 | if (CI->getType()->isIntegerTy(16) && | |||
| 58150 | IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && | |||
| 58151 | (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) || | |||
| 58152 | matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) { | |||
| 58153 | AsmPieces.clear(); | |||
| 58154 | StringRef ConstraintsStr = IA->getConstraintString(); | |||
| 58155 | SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); | |||
| 58156 | array_pod_sort(AsmPieces.begin(), AsmPieces.end()); | |||
| 58157 | if (clobbersFlagRegisters(AsmPieces)) | |||
| 58158 | return IntrinsicLowering::LowerToByteSwap(CI); | |||
| 58159 | } | |||
| 58160 | break; | |||
| 58161 | case 3: | |||
| 58162 | if (CI->getType()->isIntegerTy(32) && | |||
| 58163 | IA->getConstraintString().compare(0, 5, "=r,0,") == 0 && | |||
| 58164 | matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) && | |||
| 58165 | matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) && | |||
| 58166 | matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) { | |||
| 58167 | AsmPieces.clear(); | |||
| 58168 | StringRef ConstraintsStr = IA->getConstraintString(); | |||
| 58169 | SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ","); | |||
| 58170 | array_pod_sort(AsmPieces.begin(), AsmPieces.end()); | |||
| 58171 | if (clobbersFlagRegisters(AsmPieces)) | |||
| 58172 | return IntrinsicLowering::LowerToByteSwap(CI); | |||
| 58173 | } | |||
| 58174 | ||||
| 58175 | if (CI->getType()->isIntegerTy(64)) { | |||
| 58176 | InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints(); | |||
| 58177 | if (Constraints.size() >= 2 && | |||
| 58178 | Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" && | |||
| 58179 | Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") { | |||
| 58180 | // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64 | |||
| 58181 | if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) && | |||
| 58182 | matchAsm(AsmPieces[1], {"bswap", "%edx"}) && | |||
| 58183 | matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"})) | |||
| 58184 | return IntrinsicLowering::LowerToByteSwap(CI); | |||
| 58185 | } | |||
| 58186 | } | |||
| 58187 | break; | |||
| 58188 | } | |||
| 58189 | return false; | |||
| 58190 | } | |||
| 58191 | ||||
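| | // Map GCC's "{@cc<cond>}" flag-output constraints onto X86 condition codes; | |||
| | // aliases collapse, e.g. "{@ccc}" and "{@ccnae}" both test CF and yield COND_B. | |||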
| 58192 | static X86::CondCode parseConstraintCode(llvm::StringRef Constraint) { | |||
| 58193 | X86::CondCode Cond = StringSwitch<X86::CondCode>(Constraint) | |||
| 58194 | .Case("{@cca}", X86::COND_A) | |||
| 58195 | .Case("{@ccae}", X86::COND_AE) | |||
| 58196 | .Case("{@ccb}", X86::COND_B) | |||
| 58197 | .Case("{@ccbe}", X86::COND_BE) | |||
| 58198 | .Case("{@ccc}", X86::COND_B) | |||
| 58199 | .Case("{@cce}", X86::COND_E) | |||
| 58200 | .Case("{@ccz}", X86::COND_E) | |||
| 58201 | .Case("{@ccg}", X86::COND_G) | |||
| 58202 | .Case("{@ccge}", X86::COND_GE) | |||
| 58203 | .Case("{@ccl}", X86::COND_L) | |||
| 58204 | .Case("{@ccle}", X86::COND_LE) | |||
| 58205 | .Case("{@ccna}", X86::COND_BE) | |||
| 58206 | .Case("{@ccnae}", X86::COND_B) | |||
| 58207 | .Case("{@ccnb}", X86::COND_AE) | |||
| 58208 | .Case("{@ccnbe}", X86::COND_A) | |||
| 58209 | .Case("{@ccnc}", X86::COND_AE) | |||
| 58210 | .Case("{@ccne}", X86::COND_NE) | |||
| 58211 | .Case("{@ccnz}", X86::COND_NE) | |||
| 58212 | .Case("{@ccng}", X86::COND_LE) | |||
| 58213 | .Case("{@ccnge}", X86::COND_L) | |||
| 58214 | .Case("{@ccnl}", X86::COND_GE) | |||
| 58215 | .Case("{@ccnle}", X86::COND_G) | |||
| 58216 | .Case("{@ccno}", X86::COND_NO) | |||
| 58217 | .Case("{@ccnp}", X86::COND_NP) | |||
| 58218 | .Case("{@ccns}", X86::COND_NS) | |||
| 58219 | .Case("{@cco}", X86::COND_O) | |||
| 58220 | .Case("{@ccp}", X86::COND_P) | |||
| 58221 | .Case("{@ccs}", X86::COND_S) | |||
| 58222 | .Default(X86::COND_INVALID); | |||
| 58223 | return Cond; | |||
| 58224 | } | |||
| 58225 | ||||
| 58226 | /// Given a constraint letter, return the type of constraint for this target. | |||
| 58227 | X86TargetLowering::ConstraintType | |||
| 58228 | X86TargetLowering::getConstraintType(StringRef Constraint) const { | |||
| 58229 | if (Constraint.size() == 1) { | |||
| 58230 | switch (Constraint[0]) { | |||
| 58231 | case 'R': | |||
| 58232 | case 'q': | |||
| 58233 | case 'Q': | |||
| 58234 | case 'f': | |||
| 58235 | case 't': | |||
| 58236 | case 'u': | |||
| 58237 | case 'y': | |||
| 58238 | case 'x': | |||
| 58239 | case 'v': | |||
| 58240 | case 'l': | |||
| 58241 | case 'k': // AVX512 masking registers. | |||
| 58242 | return C_RegisterClass; | |||
| 58243 | case 'a': | |||
| 58244 | case 'b': | |||
| 58245 | case 'c': | |||
| 58246 | case 'd': | |||
| 58247 | case 'S': | |||
| 58248 | case 'D': | |||
| 58249 | case 'A': | |||
| 58250 | return C_Register; | |||
| 58251 | case 'I': | |||
| 58252 | case 'J': | |||
| 58253 | case 'K': | |||
| 58254 | case 'N': | |||
| 58255 | case 'G': | |||
| 58256 | case 'L': | |||
| 58257 | case 'M': | |||
| 58258 | return C_Immediate; | |||
| 58259 | case 'C': | |||
| 58260 | case 'e': | |||
| 58261 | case 'Z': | |||
| 58262 | return C_Other; | |||
| 58263 | default: | |||
| 58264 | break; | |||
| 58265 | } | |||
| 58266 | } | |||
| 58267 | else if (Constraint.size() == 2) { | |||
| 58268 | switch (Constraint[0]) { | |||
| 58269 | default: | |||
| 58270 | break; | |||
| 58271 | case 'Y': | |||
| 58272 | switch (Constraint[1]) { | |||
| 58273 | default: | |||
| 58274 | break; | |||
| 58275 | case 'z': | |||
| 58276 | return C_Register; | |||
| 58277 | case 'i': | |||
| 58278 | case 'm': | |||
| 58279 | case 'k': | |||
| 58280 | case 't': | |||
| 58281 | case '2': | |||
| 58282 | return C_RegisterClass; | |||
| 58283 | } | |||
| 58284 | } | |||
| 58285 | } else if (parseConstraintCode(Constraint) != X86::COND_INVALID) | |||
| 58286 | return C_Other; | |||
| 58287 | return TargetLowering::getConstraintType(Constraint); | |||
| 58288 | } | |||
| 58289 | ||||
| 58290 | /// Examine constraint type and operand type and determine a weight value. | |||
| 58291 | /// This object must already have been set up with the operand type | |||
| 58292 | /// and the current alternative constraint selected. | |||
| 58293 | TargetLowering::ConstraintWeight | |||
| 58294 | X86TargetLowering::getSingleConstraintMatchWeight( | |||
| 58295 | AsmOperandInfo &info, const char *constraint) const { | |||
| 58296 | ConstraintWeight weight = CW_Invalid; | |||
| 58297 | Value *CallOperandVal = info.CallOperandVal; | |||
| 58298 | // If we don't have a value, we can't do a match, | |||
| 58299 | // but allow it at the lowest weight. | |||
| 58300 | if (!CallOperandVal) | |||
| 58301 | return CW_Default; | |||
| 58302 | Type *type = CallOperandVal->getType(); | |||
| 58303 | // Look at the constraint type. | |||
| 58304 | switch (*constraint) { | |||
| 58305 | default: | |||
| 58306 | weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint); | |||
| 58307 | [[fallthrough]]; | |||
| 58308 | case 'R': | |||
| 58309 | case 'q': | |||
| 58310 | case 'Q': | |||
| 58311 | case 'a': | |||
| 58312 | case 'b': | |||
| 58313 | case 'c': | |||
| 58314 | case 'd': | |||
| 58315 | case 'S': | |||
| 58316 | case 'D': | |||
| 58317 | case 'A': | |||
| 58318 | if (CallOperandVal->getType()->isIntegerTy()) | |||
| 58319 | weight = CW_SpecificReg; | |||
| 58320 | break; | |||
| 58321 | case 'f': | |||
| 58322 | case 't': | |||
| 58323 | case 'u': | |||
| 58324 | if (type->isFloatingPointTy()) | |||
| 58325 | weight = CW_SpecificReg; | |||
| 58326 | break; | |||
| 58327 | case 'y': | |||
| 58328 | if (type->isX86_MMXTy() && Subtarget.hasMMX()) | |||
| 58329 | weight = CW_SpecificReg; | |||
| 58330 | break; | |||
| 58331 | case 'Y': | |||
| 58332 | if (StringRef(constraint).size() != 2) | |||
| 58333 | break; | |||
| 58334 | switch (constraint[1]) { | |||
| 58335 | default: | |||
| 58336 | return CW_Invalid; | |||
| 58337 | // XMM0 | |||
| 58338 | case 'z': | |||
| 58339 | if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) || | |||
| 58340 | ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) || | |||
| 58341 | ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())) | |||
| 58342 | return CW_SpecificReg; | |||
| 58343 | return CW_Invalid; | |||
| 58344 | // Conditional OpMask regs (AVX512) | |||
| 58345 | case 'k': | |||
| 58346 | if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512()) | |||
| 58347 | return CW_Register; | |||
| 58348 | return CW_Invalid; | |||
| 58349 | // Any MMX reg | |||
| 58350 | case 'm': | |||
| 58351 | if (type->isX86_MMXTy() && Subtarget.hasMMX()) | |||
| 58352 | return weight; | |||
| 58353 | return CW_Invalid; | |||
| 58354 | // Any SSE reg when ISA >= SSE2, same as 'x' | |||
| 58355 | case 'i': | |||
| 58356 | case 't': | |||
| 58357 | case '2': | |||
| 58358 | if (!Subtarget.hasSSE2()) | |||
| 58359 | return CW_Invalid; | |||
| 58360 | break; | |||
| 58361 | } | |||
| 58362 | break; | |||
| 58363 | case 'v': | |||
| 58364 | if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()) | |||
| 58365 | weight = CW_Register; | |||
| 58366 | [[fallthrough]]; | |||
| 58367 | case 'x': | |||
| 58368 | if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) || | |||
| 58369 | ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX())) | |||
| 58370 | weight = CW_Register; | |||
| 58371 | break; | |||
| 58372 | case 'k': | |||
| 58373 | // Enable conditional vector operations using %k<#> registers. | |||
| 58374 | if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512()) | |||
| 58375 | weight = CW_Register; | |||
| 58376 | break; | |||
| 58377 | case 'I': | |||
| 58378 | if (auto *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { | |||
| 58379 | if (C->getZExtValue() <= 31) | |||
| 58380 | weight = CW_Constant; | |||
| 58381 | } | |||
| 58382 | break; | |||
| 58383 | case 'J': | |||
| 58384 | if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) { | |||
| 58385 | if (C->getZExtValue() <= 63) | |||
| 58386 | weight = CW_Constant; | |||
| 58387 | } | |||
| 58388 | break; | |||
| 58389 | case 'K': | |||
| 58390 | if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) { | |||
| 58391 | if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f)) | |||
| 58392 | weight = CW_Constant; | |||
| 58393 | } | |||
| 58394 | break; | |||
| 58395 | case 'L': | |||
| 58396 | if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) { | |||
| 58397 | if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff)) | |||
| 58398 | weight = CW_Constant; | |||
| 58399 | } | |||
| 58400 | break; | |||
| 58401 | case 'M': | |||
| 58402 | if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) { | |||
| 58403 | if (C->getZExtValue() <= 3) | |||
| 58404 | weight = CW_Constant; | |||
| 58405 | } | |||
| 58406 | break; | |||
| 58407 | case 'N': | |||
| 58408 | if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) { | |||
| 58409 | if (C->getZExtValue() <= 0xff) | |||
| 58410 | weight = CW_Constant; | |||
| 58411 | } | |||
| 58412 | break; | |||
| 58413 | case 'G': | |||
| 58414 | case 'C': | |||
| 58415 | if (isa<ConstantFP>(CallOperandVal)) { | |||
| 58416 | weight = CW_Constant; | |||
| 58417 | } | |||
| 58418 | break; | |||
| 58419 | case 'e': | |||
| 58420 | if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) { | |||
| 58421 | if ((C->getSExtValue() >= -0x80000000LL) && | |||
| 58422 | (C->getSExtValue() <= 0x7fffffffLL)) | |||
| 58423 | weight = CW_Constant; | |||
| 58424 | } | |||
| 58425 | break; | |||
| 58426 | case 'Z': | |||
| 58427 | if (auto *C = dyn_cast<ConstantInt>(CallOperandVal)) { | |||
| 58428 | if (C->getZExtValue() <= 0xffffffff) | |||
| 58429 | weight = CW_Constant; | |||
| 58430 | } | |||
| 58431 | break; | |||
| 58432 | } | |||
| 58433 | return weight; | |||
| 58434 | } | |||
| 58435 | ||||
| 58436 | /// Try to replace an X constraint, which matches anything, with another that | |||
| 58437 | /// has more specific requirements based on the type of the corresponding | |||
| 58438 | /// operand. | |||
| 58439 | const char *X86TargetLowering:: | |||
| 58440 | LowerXConstraint(EVT ConstraintVT) const { | |||
| 58441 | // FP X constraints get lowered to SSE1/2 registers if available, otherwise | |||
| 58442 | // 'f' like normal targets. | |||
| 58443 | if (ConstraintVT.isFloatingPoint()) { | |||
| 58444 | if (Subtarget.hasSSE1()) | |||
| 58445 | return "x"; | |||
| 58446 | } | |||
| 58447 | ||||
| 58448 | return TargetLowering::LowerXConstraint(ConstraintVT); | |||
| 58449 | } | |||
| 58450 | ||||
| 58451 | // Lower @cc targets via setcc. | |||
| 58452 | SDValue X86TargetLowering::LowerAsmOutputForConstraint( | |||
| 58453 | SDValue &Chain, SDValue &Glue, const SDLoc &DL, | |||
| 58454 | const AsmOperandInfo &OpInfo, SelectionDAG &DAG) const { | |||
| 58455 | X86::CondCode Cond = parseConstraintCode(OpInfo.ConstraintCode); | |||
| 58456 | if (Cond == X86::COND_INVALID) | |||
| 58457 | return SDValue(); | |||
| 58458 | // Check that return type is valid. | |||
| 58459 | if (OpInfo.ConstraintVT.isVector() || !OpInfo.ConstraintVT.isInteger() || | |||
| 58460 | OpInfo.ConstraintVT.getSizeInBits() < 8) | |||
| 58461 | report_fatal_error("Glue output operand is of invalid type"); | |||
| 58462 | ||||
| 58463 | // Get EFLAGS register. Only update chain when copyfrom is glued. | |||
| 58464 | if (Glue.getNode()) { | |||
| 58465 | Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32, Glue); | |||
| 58466 | Chain = Glue.getValue(1); | |||
| 58467 | } else | |||
| 58468 | Glue = DAG.getCopyFromReg(Chain, DL, X86::EFLAGS, MVT::i32); | |||
| 58469 | // Extract CC code. | |||
| 58470 | SDValue CC = getSETCC(Cond, Glue, DL, DAG); | |||
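| | // getSETCC materializes the EFLAGS condition as an i8 0/1 X86ISD::SETCC value. | |||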
| 58471 | // Zero-extend to the constraint's integer type. | |||
| 58472 | SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, DL, OpInfo.ConstraintVT, CC); | |||
| 58473 | ||||
| 58474 | return Result; | |||
| 58475 | } | |||
| 58476 | ||||
| 58477 | /// Lower the specified operand into the Ops vector. | |||
| 58478 | /// If it is invalid, don't add anything to Ops. | |||
| 58479 | void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op, | |||
| 58480 | std::string &Constraint, | |||
| 58481 | std::vector<SDValue>&Ops, | |||
| 58482 | SelectionDAG &DAG) const { | |||
| 58483 | SDValue Result; | |||
| 58484 | ||||
| 58485 | // Only support length 1 constraints for now. | |||
| 58486 | if (Constraint.length() > 1) return; | |||
| 58487 | ||||
| 58488 | char ConstraintLetter = Constraint[0]; | |||
| 58489 | switch (ConstraintLetter) { | |||
| 58490 | default: break; | |||
| 58491 | case 'I': | |||
| 58492 | if (auto *C = dyn_cast<ConstantSDNode>(Op)) { | |||
| 58493 | if (C->getZExtValue() <= 31) { | |||
| 58494 | Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), | |||
| 58495 | Op.getValueType()); | |||
| 58496 | break; | |||
| 58497 | } | |||
| 58498 | } | |||
| 58499 | return; | |||
| 58500 | case 'J': | |||
| 58501 | if (auto *C = dyn_cast<ConstantSDNode>(Op)) { | |||
| 58502 | if (C->getZExtValue() <= 63) { | |||
| 58503 | Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), | |||
| 58504 | Op.getValueType()); | |||
| 58505 | break; | |||
| 58506 | } | |||
| 58507 | } | |||
| 58508 | return; | |||
| 58509 | case 'K': | |||
| 58510 | if (auto *C = dyn_cast<ConstantSDNode>(Op)) { | |||
| 58511 | if (isInt<8>(C->getSExtValue())) { | |||
| 58512 | Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), | |||
| 58513 | Op.getValueType()); | |||
| 58514 | break; | |||
| 58515 | } | |||
| 58516 | } | |||
| 58517 | return; | |||
| 58518 | case 'L': | |||
| 58519 | if (auto *C = dyn_cast<ConstantSDNode>(Op)) { | |||
| 58520 | if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff || | |||
| 58521 | (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) { | |||
| 58522 | Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), | |||
| 58523 | Op.getValueType()); | |||
| 58524 | break; | |||
| 58525 | } | |||
| 58526 | } | |||
| 58527 | return; | |||
| 58528 | case 'M': | |||
| 58529 | if (auto *C = dyn_cast<ConstantSDNode>(Op)) { | |||
| 58530 | if (C->getZExtValue() <= 3) { | |||
| 58531 | Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), | |||
| 58532 | Op.getValueType()); | |||
| 58533 | break; | |||
| 58534 | } | |||
| 58535 | } | |||
| 58536 | return; | |||
| 58537 | case 'N': | |||
| 58538 | if (auto *C = dyn_cast<ConstantSDNode>(Op)) { | |||
| 58539 | if (C->getZExtValue() <= 255) { | |||
| 58540 | Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), | |||
| 58541 | Op.getValueType()); | |||
| 58542 | break; | |||
| 58543 | } | |||
| 58544 | } | |||
| 58545 | return; | |||
| 58546 | case 'O': | |||
| 58547 | if (auto *C = dyn_cast<ConstantSDNode>(Op)) { | |||
| 58548 | if (C->getZExtValue() <= 127) { | |||
| 58549 | Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), | |||
| 58550 | Op.getValueType()); | |||
| 58551 | break; | |||
| 58552 | } | |||
| 58553 | } | |||
| 58554 | return; | |||
| 58555 | case 'e': { | |||
| 58556 | // 32-bit signed value | |||
| 58557 | if (auto *C = dyn_cast<ConstantSDNode>(Op)) { | |||
| 58558 | if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), | |||
| 58559 | C->getSExtValue())) { | |||
| 58560 | // Widen to 64 bits here to get it sign extended. | |||
| 58561 | Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64); | |||
| 58562 | break; | |||
| 58563 | } | |||
| 58564 | // FIXME gcc accepts some relocatable values here too, but only in certain | |||
| 58565 | // memory models; it's complicated. | |||
| 58566 | } | |||
| 58567 | return; | |||
| 58568 | } | |||
| 58569 | case 'Z': { | |||
| 58570 | // 32-bit unsigned value | |||
| 58571 | if (auto *C = dyn_cast<ConstantSDNode>(Op)) { | |||
| 58572 | if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()), | |||
| 58573 | C->getZExtValue())) { | |||
| 58574 | Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op), | |||
| 58575 | Op.getValueType()); | |||
| 58576 | break; | |||
| 58577 | } | |||
| 58578 | } | |||
| 58579 | // FIXME gcc accepts some relocatable values here too, but only in certain | |||
| 58580 | // memory models; it's complicated. | |||
| 58581 | return; | |||
| 58582 | } | |||
| 58583 | case 'i': { | |||
| 58584 | // Literal immediates are always ok. | |||
| 58585 | if (auto *CST = dyn_cast<ConstantSDNode>(Op)) { | |||
| 58586 | bool IsBool = CST->getConstantIntValue()->getBitWidth() == 1; | |||
| 58587 | BooleanContent BCont = getBooleanContents(MVT::i64); | |||
| 58588 | ISD::NodeType ExtOpc = IsBool ? getExtendForContent(BCont) | |||
| 58589 | : ISD::SIGN_EXTEND; | |||
| 58590 | int64_t ExtVal = ExtOpc == ISD::ZERO_EXTEND ? CST->getZExtValue() | |||
| 58591 | : CST->getSExtValue(); | |||
| 58592 | Result = DAG.getTargetConstant(ExtVal, SDLoc(Op), MVT::i64); | |||
| 58593 | break; | |||
| 58594 | } | |||
| 58595 | ||||
| 58596 | // In any sort of PIC mode addresses need to be computed at runtime by | |||
| 58597 | // adding in a register or some sort of table lookup. These can't | |||
| 58598 | // be used as immediates. BlockAddresses and BasicBlocks are fine though. | |||
| 58599 | if ((Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC()) && | |||
| 58600 | !(isa<BlockAddressSDNode>(Op) || isa<BasicBlockSDNode>(Op))) | |||
| 58601 | return; | |||
| 58602 | ||||
| 58603 | // If we are in non-pic codegen mode, we allow the address of a global (with | |||
| 58604 | // an optional displacement) to be used with 'i'. | |||
| 58605 | if (auto *GA = dyn_cast<GlobalAddressSDNode>(Op)) | |||
| 58606 | // If we require an extra load to get this address, as in PIC mode, we | |||
| 58607 | // can't accept it. | |||
| 58608 | if (isGlobalStubReference( | |||
| 58609 | Subtarget.classifyGlobalReference(GA->getGlobal()))) | |||
| 58610 | return; | |||
| 58611 | break; | |||
| 58612 | } | |||
| 58613 | } | |||
| 58614 | ||||
| 58615 | if (Result.getNode()) { | |||
| 58616 | Ops.push_back(Result); | |||
| 58617 | return; | |||
| 58618 | } | |||
| 58619 | return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG); | |||
| 58620 | } | |||
| 58621 | ||||
| 58622 | /// Check if \p RC is a general purpose register class. | |||
| 58623 | /// I.e., GR* or one of their variant. | |||
| 58624 | static bool isGRClass(const TargetRegisterClass &RC) { | |||
| 58625 | return RC.hasSuperClassEq(&X86::GR8RegClass) || | |||
| 58626 | RC.hasSuperClassEq(&X86::GR16RegClass) || | |||
| 58627 | RC.hasSuperClassEq(&X86::GR32RegClass) || | |||
| 58628 | RC.hasSuperClassEq(&X86::GR64RegClass) || | |||
| 58629 | RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass); | |||
| 58630 | } | |||
| 58631 | ||||
| 58632 | /// Check if \p RC is a vector register class. | |||
| 58633 | /// I.e., FR* / VR* or one of their variant. | |||
| 58634 | static bool isFRClass(const TargetRegisterClass &RC) { | |||
| 58635 | return RC.hasSuperClassEq(&X86::FR16XRegClass) || | |||
| 58636 | RC.hasSuperClassEq(&X86::FR32XRegClass) || | |||
| 58637 | RC.hasSuperClassEq(&X86::FR64XRegClass) || | |||
| 58638 | RC.hasSuperClassEq(&X86::VR128XRegClass) || | |||
| 58639 | RC.hasSuperClassEq(&X86::VR256XRegClass) || | |||
| 58640 | RC.hasSuperClassEq(&X86::VR512RegClass); | |||
| 58641 | } | |||
| 58642 | ||||
| 58643 | /// Check if \p RC is a mask register class. | |||
| 58644 | /// I.e., VK* or one of their variant. | |||
| 58645 | static bool isVKClass(const TargetRegisterClass &RC) { | |||
| 58646 | return RC.hasSuperClassEq(&X86::VK1RegClass) || | |||
| 58647 | RC.hasSuperClassEq(&X86::VK2RegClass) || | |||
| 58648 | RC.hasSuperClassEq(&X86::VK4RegClass) || | |||
| 58649 | RC.hasSuperClassEq(&X86::VK8RegClass) || | |||
| 58650 | RC.hasSuperClassEq(&X86::VK16RegClass) || | |||
| 58651 | RC.hasSuperClassEq(&X86::VK32RegClass) || | |||
| 58652 | RC.hasSuperClassEq(&X86::VK64RegClass); | |||
| 58653 | } | |||
| 58654 | ||||
| 58655 | std::pair<unsigned, const TargetRegisterClass *> | |||
| 58656 | X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, | |||
| 58657 | StringRef Constraint, | |||
| 58658 | MVT VT) const { | |||
| 58659 | // First, see if this is a constraint that directly corresponds to an LLVM | |||
| 58660 | // register class. | |||
| 58661 | if (Constraint.size() == 1) { | |||
| 58662 | // GCC Constraint Letters | |||
| 58663 | switch (Constraint[0]) { | |||
| 58664 | default: break; | |||
| 58665 | // 'A' means [ER]AX + [ER]DX. | |||
| 58666 | case 'A': | |||
| 58667 | if (Subtarget.is64Bit()) | |||
| 58668 | return std::make_pair(X86::RAX, &X86::GR64_ADRegClass); | |||
| 58669 | assert((Subtarget.is32Bit() || Subtarget.is16Bit()) && | |||
| 58670 | "Expecting 64, 32 or 16 bit subtarget"); | |||
| 58671 | return std::make_pair(X86::EAX, &X86::GR32_ADRegClass); | |||
| 58672 | ||||
| 58673 | // TODO: Slight differences here in allocation order and leaving | |||
| 58674 | // RIP in the class. Do they matter any more here than they do | |||
| 58675 | // in the normal allocation? | |||
| 58676 | case 'k': | |||
| 58677 | if (Subtarget.hasAVX512()) { | |||
| 58678 | if (VT == MVT::i1) | |||
| 58679 | return std::make_pair(0U, &X86::VK1RegClass); | |||
| 58680 | if (VT == MVT::i8) | |||
| 58681 | return std::make_pair(0U, &X86::VK8RegClass); | |||
| 58682 | if (VT == MVT::i16) | |||
| 58683 | return std::make_pair(0U, &X86::VK16RegClass); | |||
| 58684 | } | |||
| 58685 | if (Subtarget.hasBWI()) { | |||
| 58686 | if (VT == MVT::i32) | |||
| 58687 | return std::make_pair(0U, &X86::VK32RegClass); | |||
| 58688 | if (VT == MVT::i64) | |||
| 58689 | return std::make_pair(0U, &X86::VK64RegClass); | |||
| 58690 | } | |||
| 58691 | break; | |||
| 58692 | case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode. | |||
| 58693 | if (Subtarget.is64Bit()) { | |||
| 58694 | if (VT == MVT::i8 || VT == MVT::i1) | |||
| 58695 | return std::make_pair(0U, &X86::GR8RegClass); | |||
| 58696 | if (VT == MVT::i16) | |||
| 58697 | return std::make_pair(0U, &X86::GR16RegClass); | |||
| 58698 | if (VT == MVT::i32 || VT == MVT::f32) | |||
| 58699 | return std::make_pair(0U, &X86::GR32RegClass); | |||
| 58700 | if (VT != MVT::f80 && !VT.isVector()) | |||
| 58701 | return std::make_pair(0U, &X86::GR64RegClass); | |||
| 58702 | break; | |||
| 58703 | } | |||
| 58704 | [[fallthrough]]; | |||
| 58705 | // 32-bit fallthrough | |||
| 58706 | case 'Q': // Q_REGS | |||
| 58707 | if (VT == MVT::i8 || VT == MVT::i1) | |||
| 58708 | return std::make_pair(0U, &X86::GR8_ABCD_LRegClass); | |||
| 58709 | if (VT == MVT::i16) | |||
| 58710 | return std::make_pair(0U, &X86::GR16_ABCDRegClass); | |||
| 58711 | if (VT == MVT::i32 || VT == MVT::f32 || | |||
| 58712 | (!VT.isVector() && !Subtarget.is64Bit())) | |||
| 58713 | return std::make_pair(0U, &X86::GR32_ABCDRegClass); | |||
| 58714 | if (VT != MVT::f80 && !VT.isVector()) | |||
| 58715 | return std::make_pair(0U, &X86::GR64_ABCDRegClass); | |||
| 58716 | break; | |||
| 58717 | case 'r': // GENERAL_REGS | |||
| 58718 | case 'l': // INDEX_REGS | |||
| 58719 | if (VT == MVT::i8 || VT == MVT::i1) | |||
| 58720 | return std::make_pair(0U, &X86::GR8RegClass); | |||
| 58721 | if (VT == MVT::i16) | |||
| 58722 | return std::make_pair(0U, &X86::GR16RegClass); | |||
| 58723 | if (VT == MVT::i32 || VT == MVT::f32 || | |||
| 58724 | (!VT.isVector() && !Subtarget.is64Bit())) | |||
| 58725 | return std::make_pair(0U, &X86::GR32RegClass); | |||
| 58726 | if (VT != MVT::f80 && !VT.isVector()) | |||
| 58727 | return std::make_pair(0U, &X86::GR64RegClass); | |||
| 58728 | break; | |||
| 58729 | case 'R': // LEGACY_REGS | |||
| 58730 | if (VT == MVT::i8 || VT == MVT::i1) | |||
| 58731 | return std::make_pair(0U, &X86::GR8_NOREXRegClass); | |||
| 58732 | if (VT == MVT::i16) | |||
| 58733 | return std::make_pair(0U, &X86::GR16_NOREXRegClass); | |||
| 58734 | if (VT == MVT::i32 || VT == MVT::f32 || | |||
| 58735 | (!VT.isVector() && !Subtarget.is64Bit())) | |||
| 58736 | return std::make_pair(0U, &X86::GR32_NOREXRegClass); | |||
| 58737 | if (VT != MVT::f80 && !VT.isVector()) | |||
| 58738 | return std::make_pair(0U, &X86::GR64_NOREXRegClass); | |||
| 58739 | break; | |||
| 58740 | case 'f': // FP Stack registers. | |||
| 58741 | // If SSE is enabled for this VT, use f80 to ensure the isel moves the | |||
| 58742 | // value to the correct fpstack register class. | |||
| 58743 | if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT)) | |||
| 58744 | return std::make_pair(0U, &X86::RFP32RegClass); | |||
| 58745 | if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT)) | |||
| 58746 | return std::make_pair(0U, &X86::RFP64RegClass); | |||
| 58747 | if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) | |||
| 58748 | return std::make_pair(0U, &X86::RFP80RegClass); | |||
| 58749 | break; | |||
| 58750 | case 'y': // MMX_REGS if MMX allowed. | |||
| 58751 | if (!Subtarget.hasMMX()) break; | |||
| 58752 | return std::make_pair(0U, &X86::VR64RegClass); | |||
| 58753 | case 'v': | |||
| 58754 | case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed | |||
| 58755 | if (!Subtarget.hasSSE1()) break; | |||
| 58756 | bool VConstraint = (Constraint[0] == 'v'); | |||
| 58757 | ||||
| 58758 | switch (VT.SimpleTy) { | |||
| 58759 | default: break; | |||
| 58760 | // Scalar SSE types. | |||
| 58761 | case MVT::f16: | |||
| 58762 | if (VConstraint && Subtarget.hasFP16()) | |||
| 58763 | return std::make_pair(0U, &X86::FR16XRegClass); | |||
| 58764 | break; | |||
| 58765 | case MVT::f32: | |||
| 58766 | case MVT::i32: | |||
| 58767 | if (VConstraint && Subtarget.hasVLX()) | |||
| 58768 | return std::make_pair(0U, &X86::FR32XRegClass); | |||
| 58769 | return std::make_pair(0U, &X86::FR32RegClass); | |||
| 58770 | case MVT::f64: | |||
| 58771 | case MVT::i64: | |||
| 58772 | if (VConstraint && Subtarget.hasVLX()) | |||
| 58773 | return std::make_pair(0U, &X86::FR64XRegClass); | |||
| 58774 | return std::make_pair(0U, &X86::FR64RegClass); | |||
| 58775 | case MVT::i128: | |||
| 58776 | if (Subtarget.is64Bit()) { | |||
| 58777 | if (VConstraint && Subtarget.hasVLX()) | |||
| 58778 | return std::make_pair(0U, &X86::VR128XRegClass); | |||
| 58779 | return std::make_pair(0U, &X86::VR128RegClass); | |||
| 58780 | } | |||
| 58781 | break; | |||
| 58782 | // Vector types and fp128. | |||
| 58783 | case MVT::v8f16: | |||
| 58784 | if (!Subtarget.hasFP16()) | |||
| 58785 | break; | |||
| 58786 | [[fallthrough]]; | |||
| 58787 | case MVT::f128: | |||
| 58788 | case MVT::v16i8: | |||
| 58789 | case MVT::v8i16: | |||
| 58790 | case MVT::v4i32: | |||
| 58791 | case MVT::v2i64: | |||
| 58792 | case MVT::v4f32: | |||
| 58793 | case MVT::v2f64: | |||
| 58794 | if (VConstraint && Subtarget.hasVLX()) | |||
| 58795 | return std::make_pair(0U, &X86::VR128XRegClass); | |||
| 58796 | return std::make_pair(0U, &X86::VR128RegClass); | |||
| 58797 | // AVX types. | |||
| 58798 | case MVT::v16f16: | |||
| 58799 | if (!Subtarget.hasFP16()) | |||
| 58800 | break; | |||
| 58801 | [[fallthrough]]; | |||
| 58802 | case MVT::v32i8: | |||
| 58803 | case MVT::v16i16: | |||
| 58804 | case MVT::v8i32: | |||
| 58805 | case MVT::v4i64: | |||
| 58806 | case MVT::v8f32: | |||
| 58807 | case MVT::v4f64: | |||
| 58808 | if (VConstraint && Subtarget.hasVLX()) | |||
| 58809 | return std::make_pair(0U, &X86::VR256XRegClass); | |||
| 58810 | if (Subtarget.hasAVX()) | |||
| 58811 | return std::make_pair(0U, &X86::VR256RegClass); | |||
| 58812 | break; | |||
| 58813 | case MVT::v32f16: | |||
| 58814 | if (!Subtarget.hasFP16()) | |||
| 58815 | break; | |||
| 58816 | [[fallthrough]]; | |||
| 58817 | case MVT::v64i8: | |||
| 58818 | case MVT::v32i16: | |||
| 58819 | case MVT::v8f64: | |||
| 58820 | case MVT::v16f32: | |||
| 58821 | case MVT::v16i32: | |||
| 58822 | case MVT::v8i64: | |||
| 58823 | if (!Subtarget.hasAVX512()) break; | |||
| 58824 | if (VConstraint) | |||
| 58825 | return std::make_pair(0U, &X86::VR512RegClass); | |||
| 58826 | return std::make_pair(0U, &X86::VR512_0_15RegClass); | |||
| 58827 | } | |||
| 58828 | break; | |||
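| | // Usage sketch (illustrative): an "x" operand of type __m128 maps to | |||
| | // VR128 above, e.g. asm("addps %1, %0" : "+x"(a) : "x"(b)); with | |||
| | // AVX-512VL, "v" additionally allows xmm16-xmm31 via the VR128X class. | |||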
| 58829 | } | |||
| 58830 | } else if (Constraint.size() == 2 && Constraint[0] == 'Y') { | |||
| 58831 | switch (Constraint[1]) { | |||
| 58832 | default: | |||
| 58833 | break; | |||
| 58834 | case 'i': | |||
| 58835 | case 't': | |||
| 58836 | case '2': | |||
| 58837 | return getRegForInlineAsmConstraint(TRI, "x", VT); | |||
| 58838 | case 'm': | |||
| 58839 | if (!Subtarget.hasMMX()) break; | |||
| 58840 | return std::make_pair(0U, &X86::VR64RegClass); | |||
| 58841 | case 'z': | |||
| 58842 | if (!Subtarget.hasSSE1()) break; | |||
| 58843 | switch (VT.SimpleTy) { | |||
| 58844 | default: break; | |||
| 58845 | // Scalar SSE types. | |||
| 58846 | case MVT::f16: | |||
| 58847 | if (!Subtarget.hasFP16()) | |||
| 58848 | break; | |||
| 58849 | return std::make_pair(X86::XMM0, &X86::FR16XRegClass); | |||
| 58850 | case MVT::f32: | |||
| 58851 | case MVT::i32: | |||
| 58852 | return std::make_pair(X86::XMM0, &X86::FR32RegClass); | |||
| 58853 | case MVT::f64: | |||
| 58854 | case MVT::i64: | |||
| 58855 | return std::make_pair(X86::XMM0, &X86::FR64RegClass); | |||
| 58856 | case MVT::v8f16: | |||
| 58857 | if (!Subtarget.hasFP16()) | |||
| 58858 | break; | |||
| 58859 | [[fallthrough]]; | |||
| 58860 | case MVT::f128: | |||
| 58861 | case MVT::v16i8: | |||
| 58862 | case MVT::v8i16: | |||
| 58863 | case MVT::v4i32: | |||
| 58864 | case MVT::v2i64: | |||
| 58865 | case MVT::v4f32: | |||
| 58866 | case MVT::v2f64: | |||
| 58867 | return std::make_pair(X86::XMM0, &X86::VR128RegClass); | |||
| 58868 | // AVX types. | |||
| 58869 | case MVT::v16f16: | |||
| 58870 | if (!Subtarget.hasFP16()) | |||
| 58871 | break; | |||
| 58872 | [[fallthrough]]; | |||
| 58873 | case MVT::v32i8: | |||
| 58874 | case MVT::v16i16: | |||
| 58875 | case MVT::v8i32: | |||
| 58876 | case MVT::v4i64: | |||
| 58877 | case MVT::v8f32: | |||
| 58878 | case MVT::v4f64: | |||
| 58879 | if (Subtarget.hasAVX()) | |||
| 58880 | return std::make_pair(X86::YMM0, &X86::VR256RegClass); | |||
| 58881 | break; | |||
| 58882 | case MVT::v32f16: | |||
| 58883 | if (!Subtarget.hasFP16()) | |||
| 58884 | break; | |||
| 58885 | [[fallthrough]]; | |||
| 58886 | case MVT::v64i8: | |||
| 58887 | case MVT::v32i16: | |||
| 58888 | case MVT::v8f64: | |||
| 58889 | case MVT::v16f32: | |||
| 58890 | case MVT::v16i32: | |||
| 58891 | case MVT::v8i64: | |||
| 58892 | if (Subtarget.hasAVX512()) | |||
| 58893 | return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass); | |||
| 58894 | break; | |||
| 58895 | } | |||
| 58896 | break; | |||
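| | // Usage sketch (illustrative): "Yz" pins an SSE operand to xmm0, which | |||
| | // some instructions take implicitly, e.g. | |||
| | //   asm("blendvps %2, %1, %0" : "+x"(dst) : "x"(src), "Yz"(mask)); | |||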
| 58897 | case 'k': | |||
| 58898 | // This register class doesn't allocate k0 for masked vector operations. | |||
| 58899 | if (Subtarget.hasAVX512()) { | |||
| 58900 | if (VT == MVT::i1) | |||
| 58901 | return std::make_pair(0U, &X86::VK1WMRegClass); | |||
| 58902 | if (VT == MVT::i8) | |||
| 58903 | return std::make_pair(0U, &X86::VK8WMRegClass); | |||
| 58904 | if (VT == MVT::i16) | |||
| 58905 | return std::make_pair(0U, &X86::VK16WMRegClass); | |||
| 58906 | } | |||
| 58907 | if (Subtarget.hasBWI()) { | |||
| 58908 | if (VT == MVT::i32) | |||
| 58909 | return std::make_pair(0U, &X86::VK32WMRegClass); | |||
| 58910 | if (VT == MVT::i64) | |||
| 58911 | return std::make_pair(0U, &X86::VK64WMRegClass); | |||
| 58912 | } | |||
| 58913 | break; | |||
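| | // Usage sketch (illustrative): with AVX-512, "Yk" yields an allocatable | |||
| | // write-mask register (k1-k7, never k0), e.g. | |||
| | //   __mmask16 m; asm("kmovw %1, %0" : "=Yk"(m) : "r"(x)); | |||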
| 58914 | } | |||
| 58915 | } | |||
| 58916 | ||||
| 58917 | if (parseConstraintCode(Constraint) != X86::COND_INVALID) | |||
| 58918 | return std::make_pair(0U, &X86::GR32RegClass); | |||
| 58919 | ||||
| 58920 | // Use the default implementation in TargetLowering to convert the register | |||
| 58921 | // constraint into a member of a register class. | |||
| 58922 | std::pair<Register, const TargetRegisterClass*> Res; | |||
| 58923 | Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); | |||
| 58924 | ||||
| 58925 | // Not found as a standard register? | |||
| 58926 | if (!Res.second) { | |||
| 58927 | // Only match x87 registers if the VT is one SelectionDAGBuilder can convert | |||
| 58928 | // to/from f80. | |||
| 58929 | if (VT == MVT::Other || VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80) { | |||
| 58930 | // Map {st(0)} .. {st(7)} to the corresponding FP0 .. FP7 register. | |||
| 58931 | if (Constraint.size() == 7 && Constraint[0] == '{' && | |||
| 58932 | tolower(Constraint[1]) == 's' && tolower(Constraint[2]) == 't' && | |||
| 58933 | Constraint[3] == '(' && | |||
| 58934 | (Constraint[4] >= '0' && Constraint[4] <= '7') && | |||
| 58935 | Constraint[5] == ')' && Constraint[6] == '}') { | |||
| 58936 | // st(7) is not allocatable and thus not a member of RFP80. Return the | |||
| 58937 | // singleton class when we have a reference to it. | |||
| 58938 | if (Constraint[4] == '7') | |||
| 58939 | return std::make_pair(X86::FP7, &X86::RFP80_7RegClass); | |||
| 58940 | return std::make_pair(X86::FP0 + Constraint[4] - '0', | |||
| 58941 | &X86::RFP80RegClass); | |||
| 58942 | } | |||
| 58943 | ||||
| 58944 | // GCC allows "st(0)" to be called just plain "st". | |||
| 58945 | if (StringRef("{st}").equals_insensitive(Constraint)) | |||
| 58946 | return std::make_pair(X86::FP0, &X86::RFP80RegClass); | |||
| 58947 | } | |||
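| | // Usage sketch (illustrative): a clobber list entry such as | |||
| | //   asm volatile("fldpi; fstp %%st(0)" : : : "st"); | |||
| | // reaches this code as "{st}" with VT == MVT::Other and maps to FP0. | |||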
| 58948 | ||||
| 58949 | // flags -> EFLAGS | |||
| 58950 | if (StringRef("{flags}").equals_insensitive(Constraint)) | |||
| 58951 | return std::make_pair(X86::EFLAGS, &X86::CCRRegClass); | |||
| 58952 | ||||
| 58953 | // dirflag -> DF | |||
| 58954 | // Only allow for clobber. | |||
| 58955 | if (StringRef("{dirflag}").equals_insensitive(Constraint) && | |||
| 58956 | VT == MVT::Other) | |||
| 58957 | return std::make_pair(X86::DF, &X86::DFCCRRegClass); | |||
| 58958 | ||||
| 58959 | // fpsr -> FPSW | |||
| 58960 | if (StringRef("{fpsr}").equals_insensitive(Constraint)) | |||
| 58961 | return std::make_pair(X86::FPSW, &X86::FPCCRRegClass); | |||
| 58962 | ||||
| 58963 | return Res; | |||
| 58964 | } | |||
| 58965 | ||||
| 58966 | // Make sure it isn't a register that requires 64-bit mode. | |||
| 58967 | if (!Subtarget.is64Bit() && | |||
| 58968 | (isFRClass(*Res.second) || isGRClass(*Res.second)) && | |||
| 58969 | TRI->getEncodingValue(Res.first) >= 8) { | |||
| 58970 | // Register requires REX prefix, but we're in 32-bit mode. | |||
| 58971 | return std::make_pair(0, nullptr); | |||
| 58972 | } | |||
| 58973 | ||||
| 58974 | // Make sure it isn't a register that requires AVX512. | |||
| 58975 | if (!Subtarget.hasAVX512() && isFRClass(*Res.second) && | |||
| 58976 | TRI->getEncodingValue(Res.first) & 0x10) { | |||
| 58977 | // Register requires EVEX prefix. | |||
| 58978 | return std::make_pair(0, nullptr); | |||
| 58979 | } | |||
| 58980 | ||||
| 58981 | // Otherwise, check to see if this is a register class of the wrong value | |||
| 58982 | // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to | |||
| 58983 | // turn into {ax},{dx}. | |||
| 58984 | // MVT::Other is used to specify clobber names. | |||
| 58985 | if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other) | |||
| 58986 | return Res; // Correct type already, nothing to do. | |||
| 58987 | ||||
| 58988 | // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should | |||
| 58989 | // return "eax". This should even work for things like getting 64-bit integer | |||
| 58990 | // registers when given an f64 type. | |||
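| | // Usage sketch (illustrative, mirroring the "{ax},i32" example above): | |||
| | // for "={ax}"(x) with x of C type int (clang accepts explicit register | |||
| | // constraints in braces), the generic lookup returns AX in GR16; the | |||
| | // resizing below rewrites it to EAX in GR32. | |||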
| 58991 | const TargetRegisterClass *Class = Res.second; | |||
| 58992 | // The generic code will match the first register class that contains the | |||
| 58993 | // given register. Thus, based on the ordering of the tablegened file, | |||
| 58994 | // the "plain" GR classes might not come first. | |||
| 58995 | // Therefore, use a helper method. | |||
| 58996 | if (isGRClass(*Class)) { | |||
| 58997 | unsigned Size = VT.getSizeInBits(); | |||
| 58998 | if (Size == 1) Size = 8; | |||
| 58999 | if (Size != 8 && Size != 16 && Size != 32 && Size != 64) | |||
| 59000 | return std::make_pair(0, nullptr); | |||
| 59001 | Register DestReg = getX86SubSuperRegister(Res.first, Size); | |||
| 59002 | if (DestReg.isValid()) { | |||
| 59003 | bool is64Bit = Subtarget.is64Bit(); | |||
| 59004 | const TargetRegisterClass *RC = | |||
| 59005 | Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass) | |||
| 59006 | : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass) | |||
| 59007 | : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass) | |||
| 59008 | : /*Size == 64*/ (is64Bit ? &X86::GR64RegClass : nullptr); | |||
| 59009 | if (Size == 64 && !is64Bit) { | |||
| 59010 | // Model GCC's behavior here and select a fixed pair of 32-bit | |||
| 59011 | // registers. | |||
| 59012 | switch (DestReg) { | |||
| 59013 | case X86::RAX: | |||
| 59014 | return std::make_pair(X86::EAX, &X86::GR32_ADRegClass); | |||
| 59015 | case X86::RDX: | |||
| 59016 | return std::make_pair(X86::EDX, &X86::GR32_DCRegClass); | |||
| 59017 | case X86::RCX: | |||
| 59018 | return std::make_pair(X86::ECX, &X86::GR32_CBRegClass); | |||
| 59019 | case X86::RBX: | |||
| 59020 | return std::make_pair(X86::EBX, &X86::GR32_BSIRegClass); | |||
| 59021 | case X86::RSI: | |||
| 59022 | return std::make_pair(X86::ESI, &X86::GR32_SIDIRegClass); | |||
| 59023 | case X86::RDI: | |||
| 59024 | return std::make_pair(X86::EDI, &X86::GR32_DIBPRegClass); | |||
| 59025 | case X86::RBP: | |||
| 59026 | return std::make_pair(X86::EBP, &X86::GR32_BPSPRegClass); | |||
| 59027 | default: | |||
| 59028 | return std::make_pair(0, nullptr); | |||
| 59029 | } | |||
| 59030 | } | |||
| 59031 | if (RC && RC->contains(DestReg)) | |||
| 59032 | return std::make_pair(DestReg, RC); | |||
| 59033 | return Res; | |||
| 59034 | } | |||
| 59035 | // No register found/type mismatch. | |||
| 59036 | return std::make_pair(0, nullptr); | |||
| 59037 | } else if (isFRClass(*Class)) { | |||
| 59038 | // Handle references to XMM physical registers that got mapped into the | |||
| 59039 | // wrong class. This can happen with constraints like {xmm0} where the | |||
| 59040 | // target independent register mapper will just pick the first match it can | |||
| 59041 | // find, ignoring the required type. | |||
| 59042 | ||||
| 59043 | // TODO: Handle f128 and i128 in FR128RegClass after it is tested well. | |||
| 59044 | if (VT == MVT::f16) | |||
| 59045 | Res.second = &X86::FR16XRegClass; | |||
| 59046 | else if (VT == MVT::f32 || VT == MVT::i32) | |||
| 59047 | Res.second = &X86::FR32XRegClass; | |||
| 59048 | else if (VT == MVT::f64 || VT == MVT::i64) | |||
| 59049 | Res.second = &X86::FR64XRegClass; | |||
| 59050 | else if (TRI->isTypeLegalForClass(X86::VR128XRegClass, VT)) | |||
| 59051 | Res.second = &X86::VR128XRegClass; | |||
| 59052 | else if (TRI->isTypeLegalForClass(X86::VR256XRegClass, VT)) | |||
| 59053 | Res.second = &X86::VR256XRegClass; | |||
| 59054 | else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT)) | |||
| 59055 | Res.second = &X86::VR512RegClass; | |||
| 59056 | else { | |||
| 59057 | // Type mismatch and not a clobber: return an error. | |||
| 59058 | Res.first = 0; | |||
| 59059 | Res.second = nullptr; | |||
| 59060 | } | |||
| 59061 | } else if (isVKClass(*Class)) { | |||
| 59062 | if (VT == MVT::i1) | |||
| 59063 | Res.second = &X86::VK1RegClass; | |||
| 59064 | else if (VT == MVT::i8) | |||
| 59065 | Res.second = &X86::VK8RegClass; | |||
| 59066 | else if (VT == MVT::i16) | |||
| 59067 | Res.second = &X86::VK16RegClass; | |||
| 59068 | else if (VT == MVT::i32) | |||
| 59069 | Res.second = &X86::VK32RegClass; | |||
| 59070 | else if (VT == MVT::i64) | |||
| 59071 | Res.second = &X86::VK64RegClass; | |||
| 59072 | else { | |||
| 59073 | // Type mismatch and not a clobber: return an error. | |||
| 59074 | Res.first = 0; | |||
| 59075 | Res.second = nullptr; | |||
| 59076 | } | |||
| 59077 | } | |||
| 59078 | ||||
| 59079 | return Res; | |||
| 59080 | } | |||
| 59081 | ||||
| 59082 | bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { | |||
| 59083 | // Integer division on x86 is expensive. However, when aggressively optimizing | |||
| 59084 | // for code size, we prefer to use a div instruction, as it is usually smaller | |||
| 59085 | // than the alternative sequence. | |||
| 59086 | // The exception to this is vector division. Since x86 doesn't have vector | |||
| 59087 | // integer division, leaving the division as-is is a loss even in terms of | |||
| 59088 | // size, because it will have to be scalarized, while the alternative code | |||
| 59089 | // sequence can be performed in vector form. | |||
| 59090 | bool OptSize = Attr.hasFnAttr(Attribute::MinSize); | |||
| 59091 | return OptSize && !VT.isVector(); | |||
| 59092 | } | |||
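| | // Usage sketch (illustrative): under clang's minsize attribute, e.g. | |||
| | //   __attribute__((minsize)) int f(int a) { return a / 7; } | |||
| | // the scalar division keeps its div instruction instead of being | |||
| | // expanded into the larger multiply/shift sequence. | |||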
| 59093 | ||||
| 59094 | void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const { | |||
| 59095 | if (!Subtarget.is64Bit()) | |||
| 59096 | return; | |||
| 59097 | ||||
| 59098 | // Update IsSplitCSR in X86MachineFunctionInfo. | |||
| 59099 | X86MachineFunctionInfo *AFI = | |||
| 59100 | Entry->getParent()->getInfo<X86MachineFunctionInfo>(); | |||
| 59101 | AFI->setIsSplitCSR(true); | |||
| 59102 | } | |||
| 59103 | ||||
| 59104 | void X86TargetLowering::insertCopiesSplitCSR( | |||
| 59105 | MachineBasicBlock *Entry, | |||
| 59106 | const SmallVectorImpl<MachineBasicBlock *> &Exits) const { | |||
| 59107 | const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); | |||
| 59108 | const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent()); | |||
| 59109 | if (!IStart) | |||
| 59110 | return; | |||
| 59111 | ||||
| 59112 | const TargetInstrInfo *TII = Subtarget.getInstrInfo(); | |||
| 59113 | MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo(); | |||
| 59114 | MachineBasicBlock::iterator MBBI = Entry->begin(); | |||
| 59115 | for (const MCPhysReg *I = IStart; *I; ++I) { | |||
| 59116 | const TargetRegisterClass *RC = nullptr; | |||
| 59117 | if (X86::GR64RegClass.contains(*I)) | |||
| 59118 | RC = &X86::GR64RegClass; | |||
| 59119 | else | |||
| 59120 | llvm_unreachable("Unexpected register class in CSRsViaCopy!"); | |||
| 59121 | ||||
| 59122 | Register NewVR = MRI->createVirtualRegister(RC); | |||
| 59123 | // Create copy from CSR to a virtual register. | |||
| 59124 | // FIXME: this currently does not emit CFI pseudo-instructions, it works | |||
| 59125 | // fine for CXX_FAST_TLS since the C++-style TLS access functions should be | |||
| 59126 | // nounwind. If we want to generalize this later, we may need to emit | |||
| 59127 | // CFI pseudo-instructions. | |||
| 59128 | assert(Entry->getParent()->getFunction().hasFnAttribute( | |||
| 59129 | Attribute::NoUnwind) && | |||
| 59130 | "Function should be nounwind in insertCopiesSplitCSR!"); | |||
| 59131 | Entry->addLiveIn(*I); | |||
| 59132 | BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR) | |||
| 59133 | .addReg(*I); | |||
| 59134 | ||||
| 59135 | // Insert the copy-back instructions right before the terminator. | |||
| 59136 | for (auto *Exit : Exits) | |||
| 59137 | BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(), | |||
| 59138 | TII->get(TargetOpcode::COPY), *I) | |||
| 59139 | .addReg(NewVR); | |||
| 59140 | } | |||
| 59141 | } | |||
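| | // Sketch (illustrative): split CSR is used for functions with the | |||
| | // cxx_fast_tlscc calling convention, e.g. in IR (hypothetical name): | |||
| | //   define cxx_fast_tlscc ptr @tls_wrapper() ... | |||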
| 59142 | ||||
| 59143 | bool X86TargetLowering::supportSwiftError() const { | |||
| 59144 | return Subtarget.is64Bit(); | |||
| 59145 | } | |||
| 59146 | ||||
| 59147 | MachineInstr * | |||
| 59148 | X86TargetLowering::EmitKCFICheck(MachineBasicBlock &MBB, | |||
| 59149 | MachineBasicBlock::instr_iterator &MBBI, | |||
| 59150 | const TargetInstrInfo *TII) const { | |||
| 59151 | assert(MBBI->isCall() && MBBI->getCFIType() && | |||
| 59152 | "Invalid call instruction for a KCFI check"); | |||
| 59153 | ||||
| 59154 | MachineFunction &MF = *MBB.getParent(); | |||
| 59155 | // If the call target is a memory operand, unfold it and use R11 for the | |||
| 59156 | // call, so KCFI_CHECK won't have to recompute the address. | |||
| 59157 | switch (MBBI->getOpcode()) { | |||
| 59158 | case X86::CALL64m: | |||
| 59159 | case X86::CALL64m_NT: | |||
| 59160 | case X86::TAILJMPm64: | |||
| 59161 | case X86::TAILJMPm64_REX: { | |||
| 59162 | MachineBasicBlock::instr_iterator OrigCall = MBBI; | |||
| 59163 | SmallVector<MachineInstr *, 2> NewMIs; | |||
| 59164 | if (!TII->unfoldMemoryOperand(MF, *OrigCall, X86::R11, /*UnfoldLoad=*/true, | |||
| 59165 | /*UnfoldStore=*/false, NewMIs)) | |||
| 59166 | report_fatal_error("Failed to unfold memory operand for a KCFI check"); | |||
| 59167 | for (auto *NewMI : NewMIs) | |||
| 59168 | MBBI = MBB.insert(OrigCall, NewMI); | |||
| 59169 | assert(MBBI->isCall() && | |||
| 59170 | "Unexpected instruction after memory operand unfolding"); | |||
| 59171 | if (OrigCall->shouldUpdateCallSiteInfo()) | |||
| 59172 | MF.moveCallSiteInfo(&*OrigCall, &*MBBI); | |||
| 59173 | MBBI->setCFIType(MF, OrigCall->getCFIType()); | |||
| 59174 | OrigCall->eraseFromParent(); | |||
| 59175 | break; | |||
| 59176 | } | |||
| 59177 | default: | |||
| 59178 | break; | |||
| 59179 | } | |||
| 59180 | ||||
| 59181 | MachineOperand &Target = MBBI->getOperand(0); | |||
| 59182 | Register TargetReg; | |||
| 59183 | switch (MBBI->getOpcode()) { | |||
| 59184 | case X86::CALL64r: | |||
| 59185 | case X86::CALL64r_NT: | |||
| 59186 | case X86::TAILJMPr64: | |||
| 59187 | case X86::TAILJMPr64_REX: | |||
| 59188 | assert(Target.isReg() && "Unexpected target operand for an indirect call"); | |||
| 59189 | Target.setIsRenamable(false); | |||
| 59190 | TargetReg = Target.getReg(); | |||
| 59191 | break; | |||
| 59192 | case X86::CALL64pcrel32: | |||
| 59193 | case X86::TAILJMPd64: | |||
| 59194 | assert(Target.isSymbol() && "Unexpected target operand for a direct call"); | |||
| 59195 | // X86TargetLowering::EmitLoweredIndirectThunk always uses r11 for | |||
| 59196 | // 64-bit indirect thunk calls. | |||
| 59197 | assert(StringRef(Target.getSymbolName()).endswith("_r11") && | |||
| 59198 | "Unexpected register for an indirect thunk call"); | |||
| 59199 | TargetReg = X86::R11; | |||
| 59200 | break; | |||
| 59201 | default: | |||
| 59202 | llvm_unreachable("Unexpected CFI call opcode"); | |||
| 59203 | break; | |||
| 59204 | } | |||
| 59205 | ||||
| 59206 | return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(X86::KCFI_CHECK)) | |||
| 59207 | .addReg(TargetReg) | |||
| 59208 | .addImm(MBBI->getCFIType()) | |||
| 59209 | .getInstr(); | |||
| 59210 | } | |||
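| | // Sketch (illustrative): this runs for indirect calls carrying a "kcfi" | |||
| | // operand bundle in IR, e.g. | |||
| | //   call void %fp() [ "kcfi"(i32 12345678) ] | |||
| | // and emits KCFI_CHECK(target register, type id) just before the call. | |||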
| 59211 | ||||
| 59212 | /// Returns true if stack probing through a function call is requested. | |||
| 59213 | bool X86TargetLowering::hasStackProbeSymbol(const MachineFunction &MF) const { | |||
| 59214 | return !getStackProbeSymbolName(MF).empty(); | |||
| 59215 | } | |||
| 59216 | ||||
| 59217 | /// Returns true if stack probing through inline assembly is requested. | |||
| 59218 | bool X86TargetLowering::hasInlineStackProbe(const MachineFunction &MF) const { | |||
| 59219 | ||||
| 59220 | // No inline stack probes for Windows; it has its own mechanism. | |||
| 59221 | if (Subtarget.isOSWindows() || | |||
| 59222 | MF.getFunction().hasFnAttribute("no-stack-arg-probe")) | |||
| 59223 | return false; | |||
| 59224 | ||||
| 59225 | // If the function specifically requests inline stack probes, emit them. | |||
| 59226 | if (MF.getFunction().hasFnAttribute("probe-stack")) | |||
| 59227 | return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() == | |||
| 59228 | "inline-asm"; | |||
| 59229 | ||||
| 59230 | return false; | |||
| 59231 | } | |||
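| | // Sketch (illustrative): in IR this corresponds to the attribute | |||
| | //   attributes #0 = { "probe-stack"="inline-asm" } | |||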
| 59232 | ||||
| 59233 | /// Returns the name of the symbol used to emit stack probes or the empty | |||
| 59234 | /// string if not applicable. | |||
| 59235 | StringRef | |||
| 59236 | X86TargetLowering::getStackProbeSymbolName(const MachineFunction &MF) const { | |||
| 59237 | // Inline stack probes disable the stack probe call. | |||
| 59238 | if (hasInlineStackProbe(MF)) | |||
| 59239 | return ""; | |||
| 59240 | ||||
| 59241 | // If the function specifically requests stack probes, emit them. | |||
| 59242 | if (MF.getFunction().hasFnAttribute("probe-stack")) | |||
| 59243 | return MF.getFunction().getFnAttribute("probe-stack").getValueAsString(); | |||
| 59244 | ||||
| 59245 | // Generally, if we aren't on Windows, the platform ABI does not include | |||
| 59246 | // support for stack probes, so don't emit them. | |||
| 59247 | if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO() || | |||
| 59248 | MF.getFunction().hasFnAttribute("no-stack-arg-probe")) | |||
| 59249 | return ""; | |||
| 59250 | ||||
| 59251 | // We need a stack probe to conform to the Windows ABI. Choose the right | |||
| 59252 | // symbol. | |||
| 59253 | if (Subtarget.is64Bit()) | |||
| 59254 | return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk"; | |||
| 59255 | return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk"; | |||
| 59256 | } | |||
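| | // Sketch (illustrative): a front end can also name its own probe | |||
| | // routine via the same attribute, e.g. (hypothetical symbol) | |||
| | //   attributes #0 = { "probe-stack"="__probestack" } | |||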
| 59257 | ||||
| 59258 | unsigned | |||
| 59259 | X86TargetLowering::getStackProbeSize(const MachineFunction &MF) const { | |||
| 59260 | // The default stack probe size is 4096 if the function has no | |||
| 59261 | // "stack-probe-size" attribute. | |||
| 59262 | return MF.getFunction().getFnAttributeAsParsedInteger("stack-probe-size", | |||
| 59263 | 4096); | |||
| 59264 | } | |||
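| | // Sketch (illustrative): attributes #0 = { "stack-probe-size"="8192" } | |||
| | // doubles the default 4096-byte probing interval. | |||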
| 59265 | ||||
| 59266 | Align X86TargetLowering::getPrefLoopAlignment(MachineLoop *ML) const { | |||
| 59267 | if (ML->isInnermost() && | |||
| 59268 | ExperimentalPrefInnermostLoopAlignment.getNumOccurrences()) | |||
| 59269 | return Align(1ULL << ExperimentalPrefInnermostLoopAlignment); | |||
| 59270 | return TargetLowering::getPrefLoopAlignment(); | |||
| 59271 | } |