00001 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file defines the interfaces that X86 uses to lower LLVM code into a
00011 // selection DAG.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "X86ISelLowering.h"
00016 #include "Utils/X86ShuffleDecode.h"
00017 #include "X86CallingConv.h"
00018 #include "X86FrameLowering.h"
00019 #include "X86InstrBuilder.h"
00020 #include "X86MachineFunctionInfo.h"
00021 #include "X86TargetMachine.h"
00022 #include "X86TargetObjectFile.h"
00023 #include "llvm/ADT/SmallBitVector.h"
00024 #include "llvm/ADT/SmallSet.h"
00025 #include "llvm/ADT/Statistic.h"
00026 #include "llvm/ADT/StringExtras.h"
00027 #include "llvm/ADT/StringSwitch.h"
00028 #include "llvm/CodeGen/IntrinsicLowering.h"
00029 #include "llvm/CodeGen/MachineFrameInfo.h"
00030 #include "llvm/CodeGen/MachineFunction.h"
00031 #include "llvm/CodeGen/MachineInstrBuilder.h"
00032 #include "llvm/CodeGen/MachineJumpTableInfo.h"
00033 #include "llvm/CodeGen/MachineModuleInfo.h"
00034 #include "llvm/CodeGen/MachineRegisterInfo.h"
00035 #include "llvm/CodeGen/WinEHFuncInfo.h"
00036 #include "llvm/IR/CallSite.h"
00037 #include "llvm/IR/CallingConv.h"
00038 #include "llvm/IR/Constants.h"
00039 #include "llvm/IR/DerivedTypes.h"
00040 #include "llvm/IR/Function.h"
00041 #include "llvm/IR/GlobalAlias.h"
00042 #include "llvm/IR/GlobalVariable.h"
00043 #include "llvm/IR/Instructions.h"
00044 #include "llvm/IR/Intrinsics.h"
00045 #include "llvm/MC/MCAsmInfo.h"
00046 #include "llvm/MC/MCContext.h"
00047 #include "llvm/MC/MCExpr.h"
00048 #include "llvm/MC/MCSymbol.h"
00049 #include "llvm/Support/CommandLine.h"
00050 #include "llvm/Support/Debug.h"
00051 #include "llvm/Support/ErrorHandling.h"
00052 #include "llvm/Support/MathExtras.h"
00053 #include "llvm/Target/TargetOptions.h"
00054 #include "X86IntrinsicsInfo.h"
00055 #include <bitset>
00056 #include <numeric>
00057 #include <cctype>
00058 using namespace llvm;
00059 
00060 #define DEBUG_TYPE "x86-isel"
00061 
00062 STATISTIC(NumTailCalls, "Number of tail calls");
00063 
00064 static cl::opt<bool> ExperimentalVectorWideningLegalization(
00065     "x86-experimental-vector-widening-legalization", cl::init(false),
00066     cl::desc("Enable an experimental vector type legalization through widening "
00067              "rather than promotion."),
00068     cl::Hidden);
00069 
00070 static cl::opt<int> ReciprocalEstimateRefinementSteps(
00071     "x86-recip-refinement-steps", cl::init(1),
00072     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
00073              "result of the hardware reciprocal estimate instruction."),
00074     cl::NotHidden);
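// Since these are cl::opt flags, they can be toggled straight from the llc
// command line. A hypothetical invocation (triple and file names are
// placeholders) might look like:
//
//   llc -mtriple=x86_64-unknown-linux-gnu \
//       -x86-experimental-vector-widening-legalization \
//       -x86-recip-refinement-steps=2 input.ll -o input.s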
00075 
00076 // Forward declarations.
00077 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
00078                        SDValue V2);
00079 
00080 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
00081                                      const X86Subtarget &STI)
00082     : TargetLowering(TM), Subtarget(&STI) {
00083   X86ScalarSSEf64 = Subtarget->hasSSE2();
00084   X86ScalarSSEf32 = Subtarget->hasSSE1();
00085   TD = getDataLayout();
00086 
00087   // Set up the TargetLowering object.
00088   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
00089 
00090   // X86 is weird. It always uses i8 for shift amounts and setcc results.
00091   setBooleanContents(ZeroOrOneBooleanContent);
00092   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
00093   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
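// For example, an SSE compare such as CMPPS writes 0xFFFFFFFF into every lane
// whose comparison is true and 0 into the rest, which is exactly the -1/0 mask
// convention declared above.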
00094 
00095   // For 64-bit, since we have so many registers, use the ILP scheduler.
00096   // For 32-bit, use the register pressure specific scheduling.
00097   // For Atom, always use ILP scheduling.
00098   if (Subtarget->isAtom())
00099     setSchedulingPreference(Sched::ILP);
00100   else if (Subtarget->is64Bit())
00101     setSchedulingPreference(Sched::ILP);
00102   else
00103     setSchedulingPreference(Sched::RegPressure);
00104   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
00105   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
00106 
00107   // Bypass expensive divides on Atom when compiling with O2.
00108   if (TM.getOptLevel() >= CodeGenOpt::Default) {
00109     if (Subtarget->hasSlowDivide32())
00110       addBypassSlowDiv(32, 8);
00111     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
00112       addBypassSlowDiv(64, 16);
00113   }
00114 
00115   if (Subtarget->isTargetKnownWindowsMSVC()) {
00116     // Setup Windows compiler runtime calls.
00117     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
00118     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
00119     setLibcallName(RTLIB::SREM_I64, "_allrem");
00120     setLibcallName(RTLIB::UREM_I64, "_aullrem");
00121     setLibcallName(RTLIB::MUL_I64, "_allmul");
00122     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
00123     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
00124     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
00125     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
00126     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
00127 
00128     // The _ftol2 runtime function has an unusual calling conv, which
00129     // is modeled by a special pseudo-instruction.
00130     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
00131     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
00132     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
00133     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
00134   }
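// A sketch of the effect, assuming a 32-bit MSVC triple: IR such as
//   %q = sdiv i64 %a, %b
// has no 64-bit hardware divide to use, so it is lowered to a call to _alldiv
// with the stdcall-style convention registered above.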
00135 
00136   if (Subtarget->isTargetDarwin()) {
00137     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
00138     setUseUnderscoreSetJmp(false);
00139     setUseUnderscoreLongJmp(false);
00140   } else if (Subtarget->isTargetWindowsGNU()) {
00141     // MS runtime is weird: it exports _setjmp, but plain longjmp (no underscore)!
00142     setUseUnderscoreSetJmp(true);
00143     setUseUnderscoreLongJmp(false);
00144   } else {
00145     setUseUnderscoreSetJmp(true);
00146     setUseUnderscoreLongJmp(true);
00147   }
00148 
00149   // Set up the register classes.
00150   addRegisterClass(MVT::i8, &X86::GR8RegClass);
00151   addRegisterClass(MVT::i16, &X86::GR16RegClass);
00152   addRegisterClass(MVT::i32, &X86::GR32RegClass);
00153   if (Subtarget->is64Bit())
00154     addRegisterClass(MVT::i64, &X86::GR64RegClass);
00155 
00156   for (MVT VT : MVT::integer_valuetypes())
00157     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
00158 
00159   // We don't accept any truncstore of integer registers.
00160   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
00161   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
00162   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
00163   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
00164   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
00165   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
00166 
00167   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
00168 
00169   // SETOEQ and SETUNE require checking two conditions.
00170   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
00171   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
00172   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
00173   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
00174   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
00175   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
00176 
00177   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
00178   // operation.
00179   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
00180   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
00181   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
00182 
00183   if (Subtarget->is64Bit()) {
00184     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
00185     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00186   } else if (!TM.Options.UseSoftFloat) {
00187     // We have an algorithm for SSE2->double, and we turn this into a
00188     // 64-bit FILD followed by conditional FADD for other targets.
00189     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00190     // We have an algorithm for SSE2, and we turn this into a 64-bit
00191     // FILD for other targets.
00192     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
00193   }
00194 
00195   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
00196   // this operation.
00197   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
00198   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
00199 
00200   if (!TM.Options.UseSoftFloat) {
00201     // SSE has no i16 to fp conversion, only i32
00202     if (X86ScalarSSEf32) {
00203       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00204       // f32 and f64 cases are Legal, f80 case is not
00205       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00206     } else {
00207       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
00208       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00209     }
00210   } else {
00211     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00212     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
00213   }
00214 
00215   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
00216   // are Legal, f80 is custom lowered.
00217   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
00218   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
00219 
00220   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
00221   // this operation.
00222   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
00223   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
00224 
00225   if (X86ScalarSSEf32) {
00226     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
00227     // f32 and f64 cases are Legal, f80 case is not
00228     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00229   } else {
00230     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
00231     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00232   }
00233 
00234   // Handle FP_TO_UINT by promoting the destination to a larger signed
00235   // conversion.
00236   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
00237   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
00238   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
00239 
00240   if (Subtarget->is64Bit()) {
00241     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
00242     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
00243   } else if (!TM.Options.UseSoftFloat) {
00244     // Since AVX is a superset of SSE3, only check for SSE here.
00245     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
00246       // Expand FP_TO_UINT into a select.
00247       // FIXME: We would like to use a Custom expander here eventually to do
00248       // the optimal thing for SSE vs. the default expansion in the legalizer.
00249       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
00250     else
00251       // With SSE3 we can use fisttpll to convert to a signed i64; without
00252       // SSE, we're stuck with a fistpll.
00253       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
00254   }
00255 
00256   if (isTargetFTOL()) {
00257     // Use the _ftol2 runtime function, which has a pseudo-instruction
00258     // to handle its weird calling convention.
00259     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
00260   }
00261 
00262   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
00263   if (!X86ScalarSSEf64) {
00264     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
00265     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
00266     if (Subtarget->is64Bit()) {
00267       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
00268       // Without SSE, i64->f64 goes through memory.
00269       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
00270     }
00271   }
00272 
00273   // Scalar integer divide and remainder are lowered to use operations that
00274   // produce two results, to match the available instructions. This exposes
00275   // the two-result form to trivial CSE, which is able to combine x/y and x%y
00276   // into a single instruction.
00277   //
00278   // Scalar integer multiply-high is also lowered to use two-result
00279   // operations, to match the available instructions. However, plain multiply
00280   // (low) operations are left as Legal, as there are single-result
00281   // instructions for this in x86. Using the two-result multiply instructions
00282   // when both high and low results are needed must be arranged by dagcombine.
00283   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00284     MVT VT = IntVTs[i];
00285     setOperationAction(ISD::MULHS, VT, Expand);
00286     setOperationAction(ISD::MULHU, VT, Expand);
00287     setOperationAction(ISD::SDIV, VT, Expand);
00288     setOperationAction(ISD::UDIV, VT, Expand);
00289     setOperationAction(ISD::SREM, VT, Expand);
00290     setOperationAction(ISD::UREM, VT, Expand);
00291 
00292     // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
00293     setOperationAction(ISD::ADDC, VT, Custom);
00294     setOperationAction(ISD::ADDE, VT, Custom);
00295     setOperationAction(ISD::SUBC, VT, Custom);
00296     setOperationAction(ISD::SUBE, VT, Custom);
00297   }
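// Illustrative IR (a sketch): with SDIV and SREM expanded as above,
//   %q = sdiv i32 %x, %y
//   %r = srem i32 %x, %y
// both legalize to ISD::SDIVREM nodes that CSE into a single node, so one
// hardware divide yields both the quotient and the remainder.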
00298 
00299   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
00300   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
00301   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
00302   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
00303   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
00304   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
00305   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
00306   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
00307   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
00308   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
00309   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
00310   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
00311   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
00312   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
00313   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
00314   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
00315   if (Subtarget->is64Bit())
00316     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
00317   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
00318   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
00319   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
00320   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
00321   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
00322   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
00323   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
00324   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
00325 
00326   // Promote the i8 variants and force them on up to i32 which has a shorter
00327   // encoding.
00328   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
00329   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
00330   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
00331   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
00332   if (Subtarget->hasBMI()) {
00333     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
00334     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
00335     if (Subtarget->is64Bit())
00336       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
00337   } else {
00338     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
00339     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
00340     if (Subtarget->is64Bit())
00341       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
00342   }
00343 
00344   if (Subtarget->hasLZCNT()) {
00345     // When promoting the i8 variants, force them to i32 for a shorter
00346     // encoding.
00347     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
00348     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
00349     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
00350     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
00351     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
00352     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
00353     if (Subtarget->is64Bit())
00354       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
00355   } else {
00356     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
00357     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
00358     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
00359     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
00360     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
00361     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
00362     if (Subtarget->is64Bit()) {
00363       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
00364       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
00365     }
00366   }
00367 
00368   // Special handling for half-precision floating point conversions.
00369   // If we don't have F16C support, then lower half float conversions
00370   // into library calls.
00371   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
00372     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
00373     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
00374   }
00375 
00376   // There's never any support for operations beyond MVT::f32.
00377   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
00378   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
00379   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
00380   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
00381 
00382   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
00383   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
00384   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
00385   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
00386   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
00387   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
00388 
00389   if (Subtarget->hasPOPCNT()) {
00390     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
00391   } else {
00392     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
00393     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
00394     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
00395     if (Subtarget->is64Bit())
00396       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
00397   }
00398 
00399   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
00400 
00401   if (!Subtarget->hasMOVBE())
00402     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
00403 
00404   // These should be promoted to a larger select which is supported.
00405   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
00406   // X86 wants to expand cmov itself.
00407   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
00408   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
00409   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
00410   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
00411   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
00412   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
00413   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
00414   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
00415   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
00416   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
00417   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
00418   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
00419   if (Subtarget->is64Bit()) {
00420     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
00421     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
00422   }
00423   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
00424   // NOTE: EH_SJLJ_SETJMP/_LONGJMP are supported here NOT to implement SjLj
00425   // exception handling, but as a light-weight setjmp/longjmp replacement used
00426   // for continuations, user-level threading, etc. As a result, no other SjLj
00427   // exception interfaces are implemented; please don't build your own
00428   // exception handling on top of them.
00429   // LLVM/Clang supports zero-cost DWARF exception handling.
00430   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
00431   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
00432 
00433   // Darwin ABI issue.
00434   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
00435   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
00436   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
00437   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
00438   if (Subtarget->is64Bit())
00439     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
00440   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
00441   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
00442   if (Subtarget->is64Bit()) {
00443     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
00444     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
00445     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
00446     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
00447     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
00448   }
00449   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
00450   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
00451   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
00452   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
00453   if (Subtarget->is64Bit()) {
00454     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
00455     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
00456     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
00457   }
00458 
00459   if (Subtarget->hasSSE1())
00460     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
00461 
00462   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
00463 
00464   // Expand certain atomics
00465   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00466     MVT VT = IntVTs[i];
00467     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
00468     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
00469     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
00470   }
00471 
00472   if (Subtarget->hasCmpxchg16b()) {
00473     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
00474   }
00475 
00476   // FIXME - use subtarget debug flags
00477   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
00478       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
00479     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
00480   }
00481 
00482   if (Subtarget->is64Bit()) {
00483     setExceptionPointerRegister(X86::RAX);
00484     setExceptionSelectorRegister(X86::RDX);
00485   } else {
00486     setExceptionPointerRegister(X86::EAX);
00487     setExceptionSelectorRegister(X86::EDX);
00488   }
00489   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
00490   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
00491 
00492   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
00493   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
00494 
00495   setOperationAction(ISD::TRAP, MVT::Other, Legal);
00496   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
00497 
00498   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
00499   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
00500   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
00501   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
00502     // TargetInfo::X86_64ABIBuiltinVaList
00503     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
00504     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
00505   } else {
00506     // TargetInfo::CharPtrBuiltinVaList
00507     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
00508     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
00509   }
00510 
00511   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
00512   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
00513 
00514   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
00515 
00516   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
00517     // f32 and f64 use SSE.
00518     // Set up the FP register classes.
00519     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00520     addRegisterClass(MVT::f64, &X86::FR64RegClass);
00521 
00522     // Use ANDPD to simulate FABS.
00523     setOperationAction(ISD::FABS , MVT::f64, Custom);
00524     setOperationAction(ISD::FABS , MVT::f32, Custom);
00525 
00526     // Use XORP to simulate FNEG.
00527     setOperationAction(ISD::FNEG , MVT::f64, Custom);
00528     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00529 
00530     // Use ANDPD and ORPD to simulate FCOPYSIGN.
00531     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
00532     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00533 
00534     // Lower this to FGETSIGNx86 plus an AND.
00535     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
00536     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
00537 
00538     // We don't support sin/cos/fmod
00539     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00540     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00541     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00542     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00543     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00544     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00545 
00546     // Expand FP immediates into loads from the stack, except for the special
00547     // cases we handle.
00548     addLegalFPImmediate(APFloat(+0.0)); // xorpd
00549     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00550   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
00551     // Use SSE for f32, x87 for f64.
00552     // Set up the FP register classes.
00553     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00554     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00555 
00556     // Use ANDPS to simulate FABS.
00557     setOperationAction(ISD::FABS , MVT::f32, Custom);
00558 
00559     // Use XORP to simulate FNEG.
00560     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00561 
00562     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00563 
00564     // Use ANDPS and ORPS to simulate FCOPYSIGN.
00565     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00566     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00567 
00568     // We don't support sin/cos/fmod
00569     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00570     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00571     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00572 
00573     // Special cases we handle for FP constants.
00574     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00575     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00576     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00577     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00578     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00579 
00580     if (!TM.Options.UnsafeFPMath) {
00581       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00582       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00583       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00584     }
00585   } else if (!TM.Options.UseSoftFloat) {
00586     // f32 and f64 in x87.
00587     // Set up the FP register classes.
00588     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00589     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
00590 
00591     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00592     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
00593     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00594     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
00595 
00596     if (!TM.Options.UnsafeFPMath) {
00597       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00598       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00599       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00600       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00601       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00602       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00603     }
00604     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00605     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00606     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00607     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00608     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
00609     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
00610     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
00611     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
00612   }
00613 
00614   // We don't support FMA.
00615   setOperationAction(ISD::FMA, MVT::f64, Expand);
00616   setOperationAction(ISD::FMA, MVT::f32, Expand);
00617 
00618   // Long double always uses X87.
00619   if (!TM.Options.UseSoftFloat) {
00620     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
00621     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
00622     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
00623     {
00624       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
00625       addLegalFPImmediate(TmpFlt);  // FLD0
00626       TmpFlt.changeSign();
00627       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
00628 
00629       bool ignored;
00630       APFloat TmpFlt2(+1.0);
00631       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
00632                       &ignored);
00633       addLegalFPImmediate(TmpFlt2);  // FLD1
00634       TmpFlt2.changeSign();
00635       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
00636     }
00637 
00638     if (!TM.Options.UnsafeFPMath) {
00639       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
00640       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
00641       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
00642     }
00643 
00644     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
00645     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
00646     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
00647     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
00648     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
00649     setOperationAction(ISD::FMA, MVT::f80, Expand);
00650   }
00651 
00652   // Always use a library call for pow.
00653   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
00654   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
00655   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
00656 
00657   setOperationAction(ISD::FLOG, MVT::f80, Expand);
00658   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
00659   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
00660   setOperationAction(ISD::FEXP, MVT::f80, Expand);
00661   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
00662   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
00663   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
00664 
00665   // First set operation action for all vector types to either promote
00666   // (for widening) or expand (for scalarization). Then we will selectively
00667   // turn on ones that can be effectively codegen'd.
00668   for (MVT VT : MVT::vector_valuetypes()) {
00669     setOperationAction(ISD::ADD , VT, Expand);
00670     setOperationAction(ISD::SUB , VT, Expand);
00671     setOperationAction(ISD::FADD, VT, Expand);
00672     setOperationAction(ISD::FNEG, VT, Expand);
00673     setOperationAction(ISD::FSUB, VT, Expand);
00674     setOperationAction(ISD::MUL , VT, Expand);
00675     setOperationAction(ISD::FMUL, VT, Expand);
00676     setOperationAction(ISD::SDIV, VT, Expand);
00677     setOperationAction(ISD::UDIV, VT, Expand);
00678     setOperationAction(ISD::FDIV, VT, Expand);
00679     setOperationAction(ISD::SREM, VT, Expand);
00680     setOperationAction(ISD::UREM, VT, Expand);
00681     setOperationAction(ISD::LOAD, VT, Expand);
00682     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
00683     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
00684     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
00685     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
00686     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
00687     setOperationAction(ISD::FABS, VT, Expand);
00688     setOperationAction(ISD::FSIN, VT, Expand);
00689     setOperationAction(ISD::FSINCOS, VT, Expand);
00690     setOperationAction(ISD::FCOS, VT, Expand);
00691     setOperationAction(ISD::FSINCOS, VT, Expand);
00692     setOperationAction(ISD::FREM, VT, Expand);
00693     setOperationAction(ISD::FMA,  VT, Expand);
00694     setOperationAction(ISD::FPOWI, VT, Expand);
00695     setOperationAction(ISD::FSQRT, VT, Expand);
00696     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
00697     setOperationAction(ISD::FFLOOR, VT, Expand);
00698     setOperationAction(ISD::FCEIL, VT, Expand);
00699     setOperationAction(ISD::FTRUNC, VT, Expand);
00700     setOperationAction(ISD::FRINT, VT, Expand);
00701     setOperationAction(ISD::FNEARBYINT, VT, Expand);
00702     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
00703     setOperationAction(ISD::MULHS, VT, Expand);
00704     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
00705     setOperationAction(ISD::MULHU, VT, Expand);
00706     setOperationAction(ISD::SDIVREM, VT, Expand);
00707     setOperationAction(ISD::UDIVREM, VT, Expand);
00708     setOperationAction(ISD::FPOW, VT, Expand);
00709     setOperationAction(ISD::CTPOP, VT, Expand);
00710     setOperationAction(ISD::CTTZ, VT, Expand);
00711     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
00712     setOperationAction(ISD::CTLZ, VT, Expand);
00713     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
00714     setOperationAction(ISD::SHL, VT, Expand);
00715     setOperationAction(ISD::SRA, VT, Expand);
00716     setOperationAction(ISD::SRL, VT, Expand);
00717     setOperationAction(ISD::ROTL, VT, Expand);
00718     setOperationAction(ISD::ROTR, VT, Expand);
00719     setOperationAction(ISD::BSWAP, VT, Expand);
00720     setOperationAction(ISD::SETCC, VT, Expand);
00721     setOperationAction(ISD::FLOG, VT, Expand);
00722     setOperationAction(ISD::FLOG2, VT, Expand);
00723     setOperationAction(ISD::FLOG10, VT, Expand);
00724     setOperationAction(ISD::FEXP, VT, Expand);
00725     setOperationAction(ISD::FEXP2, VT, Expand);
00726     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
00727     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
00728     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
00729     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
00730     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
00731     setOperationAction(ISD::TRUNCATE, VT, Expand);
00732     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
00733     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
00734     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
00735     setOperationAction(ISD::VSELECT, VT, Expand);
00736     setOperationAction(ISD::SELECT_CC, VT, Expand);
00737     for (MVT InnerVT : MVT::vector_valuetypes()) {
00738       setTruncStoreAction(InnerVT, VT, Expand);
00739 
00740       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
00741       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
00742 
00743       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
00744       // types; we have to deal with them whether we ask for Expansion or not.
00745       // Setting Expand causes its own optimisation problems though, so leave
00746       // them legal.
00747       if (VT.getVectorElementType() == MVT::i1)
00748         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
00749     }
00750   }
00751 
00752   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
00753   // with -msoft-float, disable use of MMX as well.
00754   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
00755     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
00756     // No operations on x86mmx are supported; everything uses intrinsics.
00757   }
00758 
00759   // MMX-sized vectors (other than x86mmx) are expected to be expanded
00760   // into smaller operations.
00761   for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) {
00762     setOperationAction(ISD::MULHS,              MMXTy,      Expand);
00763     setOperationAction(ISD::AND,                MMXTy,      Expand);
00764     setOperationAction(ISD::OR,                 MMXTy,      Expand);
00765     setOperationAction(ISD::XOR,                MMXTy,      Expand);
00766     setOperationAction(ISD::SCALAR_TO_VECTOR,   MMXTy,      Expand);
00767     setOperationAction(ISD::SELECT,             MMXTy,      Expand);
00768     setOperationAction(ISD::BITCAST,            MMXTy,      Expand);
00769   }
00770   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
00771 
00772   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
00773     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
00774 
00775     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
00776     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
00777     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
00778     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
00779     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
00780     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
00781     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
00782     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
00783     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
00784     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
00785     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
00786     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00787     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
00788     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
00789   }
00790 
00791   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
00792     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
00793 
00794     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
00795     // registers cannot be used even for integer operations.
00796     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
00797     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
00798     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
00799     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
00800 
00801     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
00802     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
00803     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
00804     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
00805     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
00806     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
00807     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
00808     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
00809     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
00810     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
00811     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
00812     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
00813     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
00814     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
00815     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
00816     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
00817     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
00818     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
00819     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
00820     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
00821     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
00822     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
00823 
00824     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
00825     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
00826     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
00827     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
00828 
00829     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
00830     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
00831     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
00832     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
00833     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
00834 
00835     // Only provide customized ctpop vector bit twiddling for vector types we
00836     // know to perform better than using the popcnt instructions on each vector
00837     // element. If popcnt isn't supported, always provide the custom version.
00838     if (!Subtarget->hasPOPCNT()) {
00839       setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
00840       setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
00841     }
00842 
00843     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
00844     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
00845       MVT VT = (MVT::SimpleValueType)i;
00846       // Do not attempt to custom lower non-power-of-2 vectors
00847       if (!isPowerOf2_32(VT.getVectorNumElements()))
00848         continue;
00849       // Do not attempt to custom lower non-128-bit vectors
00850       if (!VT.is128BitVector())
00851         continue;
00852       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
00853       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
00854       setOperationAction(ISD::VSELECT,            VT, Custom);
00855       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
00856     }
00857 
00858     // We support custom legalizing of sext and anyext loads for specific
00859     // memory vector types which we can load as a scalar (or sequence of
00860     // scalars) and extend in-register to a legal 128-bit vector type. For sext
00861     // loads these must work with a single scalar load.
00862     for (MVT VT : MVT::integer_vector_valuetypes()) {
00863       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
00864       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
00865       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
00866       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
00867       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
00868       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
00869       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
00870       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
00871       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
00872     }
00873 
00874     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
00875     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
00876     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
00877     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
00878     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
00879     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
00880     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
00881     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
00882 
00883     if (Subtarget->is64Bit()) {
00884       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
00885       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
00886     }
00887 
00888     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
00889     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
00890       MVT VT = (MVT::SimpleValueType)i;
00891 
00892       // Do not attempt to promote non-128-bit vectors
00893       if (!VT.is128BitVector())
00894         continue;
00895 
00896       setOperationAction(ISD::AND,    VT, Promote);
00897       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
00898       setOperationAction(ISD::OR,     VT, Promote);
00899       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
00900       setOperationAction(ISD::XOR,    VT, Promote);
00901       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
00902       setOperationAction(ISD::LOAD,   VT, Promote);
00903       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
00904       setOperationAction(ISD::SELECT, VT, Promote);
00905       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
00906     }
00907 
00908     // Custom lower v2i64 and v2f64 selects.
00909     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
00910     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
00911     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
00912     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
00913 
00914     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
00915     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
00916 
00917     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
00918     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
00919     // As there is no 64-bit GPR available, we need to build a special custom
00920     // sequence to convert from v2i32 to v2f32.
00921     if (!Subtarget->is64Bit())
00922       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
00923 
00924     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
00925     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
00926 
00927     for (MVT VT : MVT::fp_vector_valuetypes())
00928       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
00929 
00930     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
00931     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
00932     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
00933   }
00934 
00935   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
00936     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
00937       setOperationAction(ISD::FFLOOR,           RoundedTy,  Legal);
00938       setOperationAction(ISD::FCEIL,            RoundedTy,  Legal);
00939       setOperationAction(ISD::FTRUNC,           RoundedTy,  Legal);
00940       setOperationAction(ISD::FRINT,            RoundedTy,  Legal);
00941       setOperationAction(ISD::FNEARBYINT,       RoundedTy,  Legal);
00942     }
00943 
00944     // FIXME: Do we need to handle scalar-to-vector here?
00945     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
00946 
00947     // We directly match byte blends in the backend as they match the VSELECT
00948     // condition form.
00949     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
00950 
00951     // SSE41 brings specific instructions for doing vector sign extend even in
00952     // cases where we don't have SRA.
00953     for (MVT VT : MVT::integer_vector_valuetypes()) {
00954       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
00955       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
00956       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
00957     }
00958 
00959     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
00960     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
00961     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
00962     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
00963     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
00964     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
00965     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
00966 
00967     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
00968     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
00969     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
00970     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
00971     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
00972     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
00973 
00974     // i8 and i16 vectors are custom because the source register and source
00975     // memory operand types are not the same width.  f32 vectors are
00976     // custom since the immediate controlling the insert encodes additional
00977     // information.
00978     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
00979     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
00980     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
00981     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
00982 
00983     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
00984     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
00985     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
00986     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00987 
00988     // FIXME: these should be Legal, but that's only for the case where
00989     // the index is constant.  For now custom expand to deal with that.
00990     if (Subtarget->is64Bit()) {
00991       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
00992       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
00993     }
00994   }
00995 
00996   if (Subtarget->hasSSE2()) {
00997     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
00998     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
00999 
01000     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
01001     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
01002 
01003     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
01004     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
01005 
01006     // In the customized shift lowering, the legal cases in AVX2 will be
01007     // recognized.
01008     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
01009     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
01010 
01011     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
01012     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
01013 
01014     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
01015   }
01016 
01017   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
01018     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
01019     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
01020     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
01021     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
01022     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
01023     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
01024 
01025     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
01026     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
01027     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
01028 
01029     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
01030     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
01031     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
01032     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
01033     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
01034     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
01035     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
01036     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
01037     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
01038     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
01039     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
01040     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
01041 
01042     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
01043     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
01044     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
01045     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
01046     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
01047     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
01048     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
01049     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
01050     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
01051     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
01052     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
01053     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
01054 
01055     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
01056     // even though v8i16 is a legal type.
01057     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
01058     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
01059     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
01060 
01061     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
01062     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
01063     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
01064 
01065     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
01066     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
01067 
01068     for (MVT VT : MVT::fp_vector_valuetypes())
01069       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
01070 
01071     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
01072     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
01073 
01074     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
01075     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
01076 
01077     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
01078     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
01079 
01080     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
01081     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
01082     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
01083     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
01084 
01085     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
01086     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
01087     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
01088 
01089     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
01090     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
01091     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
01092     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
01093     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
01094     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
01095     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
01096     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
01097     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
01098     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
01099     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
01100     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
01101 
01102     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
01103       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
01104       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
01105       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
01106       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
01107       setOperationAction(ISD::FMA,             MVT::f32, Legal);
01108       setOperationAction(ISD::FMA,             MVT::f64, Legal);
01109     }
01110 
01111     if (Subtarget->hasInt256()) {
01112       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
01113       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
01114       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
01115       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
01116 
01117       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
01118       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
01119       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
01120       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
01121 
01122       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01123       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
01124       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
01125       // Don't lower v32i8 because there is no 128-bit byte mul
01126 
01127       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
01128       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
01129       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
01130       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
01131 
01132       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
01133       // when we have a 256-bit-wide blend with an immediate.
01134       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
01135 
01136       // Only provide customized ctpop vector bit-twiddling for vector types
01137       // where we know it performs better than using the popcnt instruction on
01138       // each vector element. If popcnt isn't supported, always provide the
01139       // custom version.
01140       if (!Subtarget->hasPOPCNT())
01141         setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
01142 
01143       // Custom CTPOP always performs better on natively supported v8i32.
01144       setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
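      // Illustrative sketch only (not necessarily the exact sequence emitted):
      // the classic scalar bit-twiddling popcount that a custom vector lowering
      // conceptually applies per 32-bit lane is
      //
      //   uint32_t popcount32(uint32_t X) {
      //     X = X - ((X >> 1) & 0x55555555);
      //     X = (X & 0x33333333) + ((X >> 2) & 0x33333333);
      //     X = (X + (X >> 4)) & 0x0F0F0F0F;
      //     return (X * 0x01010101) >> 24;
      //   }
      //
      // The real lowering may instead use an in-register nibble lookup (PSHUFB);
      // see the CTPOP lowering code later in this file.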
01145 
01146       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
01147       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
01148       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
01149       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
01150       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
01151       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
01152       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
01153 
01154       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
01155       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
01156       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
01157       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
01158       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
01159       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
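      // For illustration (assuming AVX2, as in this block): an extending load
      // such as
      //   (v8i32 (sextload addr)) with a v8i8 memory type
      // can now be selected as a single memory-operand VPMOVSXBD rather than a
      // plain load followed by a separate sign-extension.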
01160     } else {
01161       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
01162       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
01163       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
01164       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
01165 
01166       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
01167       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
01168       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
01169       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
01170 
01171       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01172       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
01173       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
01174       // Don't lower v32i8 because there is no 128-bit byte mul
01175     }
01176 
01177     // In the customized shift lowering, the legal cases in AVX2 will be
01178     // recognized.
01179     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
01180     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
01181 
01182     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
01183     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
01184 
01185     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
01186 
01187     // Custom lower several nodes for 256-bit types.
01188     for (MVT VT : MVT::vector_valuetypes()) {
01189       if (VT.getScalarSizeInBits() >= 32) {
01190         setOperationAction(ISD::MLOAD,  VT, Legal);
01191         setOperationAction(ISD::MSTORE, VT, Legal);
01192       }
01193       // Extract subvector is special because the value type
01194       // (result) is 128-bit but the source is 256-bit wide.
01195       if (VT.is128BitVector()) {
01196         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01197       }
01198       // Do not attempt to custom lower other non-256-bit vectors
01199       if (!VT.is256BitVector())
01200         continue;
01201 
01202       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01203       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01204       setOperationAction(ISD::VSELECT,            VT, Custom);
01205       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
01206       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01207       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
01208       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
01209       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
01210     }
01211 
01212     if (Subtarget->hasInt256())
01213       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
01214 
01215 
01216     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
01217     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
01218       MVT VT = (MVT::SimpleValueType)i;
01219 
01220       // Do not attempt to promote non-256-bit vectors
01221       if (!VT.is256BitVector())
01222         continue;
01223 
01224       setOperationAction(ISD::AND,    VT, Promote);
01225       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
01226       setOperationAction(ISD::OR,     VT, Promote);
01227       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
01228       setOperationAction(ISD::XOR,    VT, Promote);
01229       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
01230       setOperationAction(ISD::LOAD,   VT, Promote);
01231       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
01232       setOperationAction(ISD::SELECT, VT, Promote);
01233       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
01234     }
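    // Rough sketch of the effect of this promotion (hypothetical values %a, %b):
    //   (and:v32i8 %a, %b)
    // is legalized as
    //   (bitcast:v32i8 (and:v4i64 (bitcast:v4i64 %a), (bitcast:v4i64 %b)))
    // so only the v4i64 forms need native 256-bit patterns.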
01235   }
01236 
01237   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
01238     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
01239     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
01240     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
01241     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
01242 
01243     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
01244     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
01245     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
01246 
01247     for (MVT VT : MVT::fp_vector_valuetypes())
01248       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
01249 
01250     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
01251     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
01252     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
01253     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
01254     setOperationAction(ISD::AND,                MVT::i1,    Legal);
01255     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
01256     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
01257     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
01258     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
01259     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
01260 
01261     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
01262     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
01263     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
01264     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
01265     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
01266     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
01267 
01268     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
01269     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
01270     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
01271     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
01272     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
01273     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
01274     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
01275     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
01276 
01277     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
01278     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
01279     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
01280     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
01281     if (Subtarget->is64Bit()) {
01282       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
01283       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
01284       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
01285       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
01286     }
01287     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
01288     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
01289     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
01290     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
01291     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
01292     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
01293     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
01294     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
01295     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
01296     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
01297     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
01298     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
01299     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
01300     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
01301 
01302     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
01303     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
01304     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
01305     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
01306     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
01307     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
01308     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
01309     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
01310     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
01311     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
01312     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
01313     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
01314     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
01315 
01316     setOperationAction(ISD::FFLOOR,             MVT::v16f32, Legal);
01317     setOperationAction(ISD::FFLOOR,             MVT::v8f64, Legal);
01318     setOperationAction(ISD::FCEIL,              MVT::v16f32, Legal);
01319     setOperationAction(ISD::FCEIL,              MVT::v8f64, Legal);
01320     setOperationAction(ISD::FTRUNC,             MVT::v16f32, Legal);
01321     setOperationAction(ISD::FTRUNC,             MVT::v8f64, Legal);
01322     setOperationAction(ISD::FRINT,              MVT::v16f32, Legal);
01323     setOperationAction(ISD::FRINT,              MVT::v8f64, Legal);
01324     setOperationAction(ISD::FNEARBYINT,         MVT::v16f32, Legal);
01325     setOperationAction(ISD::FNEARBYINT,         MVT::v8f64, Legal);
01326 
01327     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
01328     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
01329     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
01330     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
01331     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
01332 
01333     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
01334     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
01335 
01336     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
01337 
01338     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
01339     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
01340     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
01341     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
01342     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
01343     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
01344     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
01345     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
01346     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
01347 
01348     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
01349     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
01350 
01351     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
01352     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
01353 
01354     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
01355 
01356     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
01357     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
01358 
01359     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
01360     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
01361 
01362     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
01363     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
01364 
01365     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
01366     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
01367     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
01368     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
01369     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
01370     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
01371 
01372     if (Subtarget->hasCDI()) {
01373       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
01374       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
01375     }
01376     if (Subtarget->hasDQI()) {
01377       setOperationAction(ISD::MUL,             MVT::v2i64, Legal);
01378       setOperationAction(ISD::MUL,             MVT::v4i64, Legal);
01379       setOperationAction(ISD::MUL,             MVT::v8i64, Legal);
01380     }
01381     // Custom lower several nodes.
01382     for (MVT VT : MVT::vector_valuetypes()) {
01383       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01384       // Extract subvector is special because the value type
01385       // (result) is 256/128-bit but the source is 512-bit wide.
01386       if (VT.is128BitVector() || VT.is256BitVector()) {
01387         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01388       }
01389       if (VT.getVectorElementType() == MVT::i1)
01390         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
01391 
01392       // Do not attempt to custom lower other non-512-bit vectors
01393       if (!VT.is512BitVector())
01394         continue;
01395 
01396       if (EltSize >= 32) {
01397         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
01398         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
01399         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01400         setOperationAction(ISD::VSELECT,             VT, Legal);
01401         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
01402         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
01403         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
01404         setOperationAction(ISD::MLOAD,               VT, Legal);
01405         setOperationAction(ISD::MSTORE,              VT, Legal);
01406       }
01407     }
01408     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01409       MVT VT = (MVT::SimpleValueType)i;
01410 
01411       // Do not attempt to promote non-512-bit vectors.
01412       if (!VT.is512BitVector())
01413         continue;
01414 
01415       setOperationAction(ISD::SELECT, VT, Promote);
01416       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
01417     }
01418   } // has AVX-512
01419 
01420   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
01421     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
01422     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
01423 
01424     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
01425     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
01426 
01427     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
01428     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
01429     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
01430     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
01431     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
01432     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
01433     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
01434     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
01435     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
01436     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i1, Custom);
01437     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
01438     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
01439     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
01440 
01441     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01442       const MVT VT = (MVT::SimpleValueType)i;
01443 
01444       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01445 
01446       // Do not attempt to promote non-512-bit vectors.
01447       if (!VT.is512BitVector())
01448         continue;
01449 
01450       if (EltSize < 32) {
01451         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01452         setOperationAction(ISD::VSELECT,             VT, Legal);
01453       }
01454     }
01455   }
01456 
01457   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
01458     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
01459     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
01460 
01461     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
01462     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
01463     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i1, Custom);
01464     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1, Custom);
01465     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Custom);
01466     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v4i1, Custom);
01467 
01468     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
01469     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
01470     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
01471     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
01472     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
01473     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
01474   }
01475 
01476   // We want to custom lower some of our intrinsics.
01477   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
01478   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
01479   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
01480   if (!Subtarget->is64Bit())
01481     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
01482 
01483   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
01484   // handle type legalization for these operations here.
01485   //
01486   // FIXME: We really should do custom legalization for addition and
01487   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
01488   // than generic legalization for 64-bit multiplication-with-overflow, though.
01489   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
01490     // Add/Sub/Mul with overflow operations are custom lowered.
01491     MVT VT = IntVTs[i];
01492     setOperationAction(ISD::SADDO, VT, Custom);
01493     setOperationAction(ISD::UADDO, VT, Custom);
01494     setOperationAction(ISD::SSUBO, VT, Custom);
01495     setOperationAction(ISD::USUBO, VT, Custom);
01496     setOperationAction(ISD::SMULO, VT, Custom);
01497     setOperationAction(ISD::UMULO, VT, Custom);
01498   }
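  // For illustration: these custom lowerings back the IR overflow intrinsics,
  // e.g. the hypothetical snippet
  //   %r = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b)
  // is selected as an ADD followed by reading the overflow flag (SETO), rather
  // than a compare-based expansion.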
01499 
01500 
01501   if (!Subtarget->is64Bit()) {
01502     // These libcalls are not available in 32-bit mode.
01503     setLibcallName(RTLIB::SHL_I128, nullptr);
01504     setLibcallName(RTLIB::SRL_I128, nullptr);
01505     setLibcallName(RTLIB::SRA_I128, nullptr);
01506   }
01507 
01508   // Combine sin / cos into one node or libcall if possible.
01509   if (Subtarget->hasSinCos()) {
01510     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
01511     setLibcallName(RTLIB::SINCOS_F64, "sincos");
01512     if (Subtarget->isTargetDarwin()) {
01513       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
01514       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
01515       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
01516       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
01517     }
01518   }
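  // For illustration (Darwin path above, hypothetical IR): the pair of calls
  //   %s = call float @sinf(float %x)
  //   %c = call float @cosf(float %x)
  // on the same argument can be merged into one __sincos_stret libcall that
  // returns both results, avoiding the memory round-trip of the plain sincos
  // expansion.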
01519 
01520   if (Subtarget->isTargetWin64()) {
01521     setOperationAction(ISD::SDIV, MVT::i128, Custom);
01522     setOperationAction(ISD::UDIV, MVT::i128, Custom);
01523     setOperationAction(ISD::SREM, MVT::i128, Custom);
01524     setOperationAction(ISD::UREM, MVT::i128, Custom);
01525     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
01526     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
01527   }
01528 
01529   // We have target-specific dag combine patterns for the following nodes:
01530   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
01531   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
01532   setTargetDAGCombine(ISD::BITCAST);
01533   setTargetDAGCombine(ISD::VSELECT);
01534   setTargetDAGCombine(ISD::SELECT);
01535   setTargetDAGCombine(ISD::SHL);
01536   setTargetDAGCombine(ISD::SRA);
01537   setTargetDAGCombine(ISD::SRL);
01538   setTargetDAGCombine(ISD::OR);
01539   setTargetDAGCombine(ISD::AND);
01540   setTargetDAGCombine(ISD::ADD);
01541   setTargetDAGCombine(ISD::FADD);
01542   setTargetDAGCombine(ISD::FSUB);
01543   setTargetDAGCombine(ISD::FMA);
01544   setTargetDAGCombine(ISD::SUB);
01545   setTargetDAGCombine(ISD::LOAD);
01546   setTargetDAGCombine(ISD::MLOAD);
01547   setTargetDAGCombine(ISD::STORE);
01548   setTargetDAGCombine(ISD::MSTORE);
01549   setTargetDAGCombine(ISD::ZERO_EXTEND);
01550   setTargetDAGCombine(ISD::ANY_EXTEND);
01551   setTargetDAGCombine(ISD::SIGN_EXTEND);
01552   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
01553   setTargetDAGCombine(ISD::TRUNCATE);
01554   setTargetDAGCombine(ISD::SINT_TO_FP);
01555   setTargetDAGCombine(ISD::SETCC);
01556   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
01557   setTargetDAGCombine(ISD::BUILD_VECTOR);
01558   setTargetDAGCombine(ISD::MUL);
01559   setTargetDAGCombine(ISD::XOR);
01560 
01561   computeRegisterProperties(Subtarget->getRegisterInfo());
01562 
01563   // On Darwin, -Os means optimize for size without hurting performance,
01564   // so do not reduce the limit.
01565   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
01566   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
01567   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
01568   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01569   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
01570   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01571   setPrefLoopAlignment(4); // 2^4 bytes.
01572 
01573   // Predictable cmovs don't hurt on Atom because it's in-order.
01574   PredictableSelectIsExpensive = !Subtarget->isAtom();
01575   EnableExtLdPromotion = true;
01576   setPrefFunctionAlignment(4); // 2^4 bytes.
01577 
01578   verifyIntrinsicTables();
01579 }
01580 
01581 // This has so far only been implemented for 64-bit MachO.
01582 bool X86TargetLowering::useLoadStackGuardNode() const {
01583   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
01584 }
01585 
01586 TargetLoweringBase::LegalizeTypeAction
01587 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
01588   if (ExperimentalVectorWideningLegalization &&
01589       VT.getVectorNumElements() != 1 &&
01590       VT.getVectorElementType().getSimpleVT() != MVT::i1)
01591     return TypeWidenVector;
01592 
01593   return TargetLoweringBase::getPreferredVectorAction(VT);
01594 }
01595 
01596 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
01597   if (!VT.isVector())
01598     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
01599 
01600   const unsigned NumElts = VT.getVectorNumElements();
01601   const EVT EltVT = VT.getVectorElementType();
01602   if (VT.is512BitVector()) {
01603     if (Subtarget->hasAVX512())
01604       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01605           EltVT == MVT::f32 || EltVT == MVT::f64)
01606         switch(NumElts) {
01607         case  8: return MVT::v8i1;
01608         case 16: return MVT::v16i1;
01609       }
01610     if (Subtarget->hasBWI())
01611       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01612         switch(NumElts) {
01613         case 32: return MVT::v32i1;
01614         case 64: return MVT::v64i1;
01615       }
01616   }
01617 
01618   if (VT.is256BitVector() || VT.is128BitVector()) {
01619     if (Subtarget->hasVLX())
01620       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01621           EltVT == MVT::f32 || EltVT == MVT::f64)
01622         switch(NumElts) {
01623         case 2: return MVT::v2i1;
01624         case 4: return MVT::v4i1;
01625         case 8: return MVT::v8i1;
01626       }
01627     if (Subtarget->hasBWI() && Subtarget->hasVLX())
01628       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01629         switch(NumElts) {
01630         case  8: return MVT::v8i1;
01631         case 16: return MVT::v16i1;
01632         case 32: return MVT::v32i1;
01633       }
01634   }
01635 
01636   return VT.changeVectorElementTypeToInteger();
01637 }
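// Usage sketch for the mapping above (hypothetical IR): with AVX-512, a compare
//   %m = fcmp olt <16 x float> %a, %b
// gets the v16i1 result type, which lives in a k-mask register rather than in a
// full-width vector of all-zeros/all-ones lanes.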
01638 
01639 /// Helper for getByValTypeAlignment to determine
01640 /// the desired ByVal argument alignment.
01641 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
01642   if (MaxAlign == 16)
01643     return;
01644   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
01645     if (VTy->getBitWidth() == 128)
01646       MaxAlign = 16;
01647   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
01648     unsigned EltAlign = 0;
01649     getMaxByValAlign(ATy->getElementType(), EltAlign);
01650     if (EltAlign > MaxAlign)
01651       MaxAlign = EltAlign;
01652   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
01653     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
01654       unsigned EltAlign = 0;
01655       getMaxByValAlign(STy->getElementType(i), EltAlign);
01656       if (EltAlign > MaxAlign)
01657         MaxAlign = EltAlign;
01658       if (MaxAlign == 16)
01659         break;
01660     }
01661   }
01662 }
01663 
01664 /// Return the desired alignment for ByVal aggregate
01665 /// function arguments in the caller parameter area. For X86, aggregates
01666 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
01667 /// are at 4-byte boundaries.
01668 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
01669   if (Subtarget->is64Bit()) {
01670     // Max of 8 and alignment of type.
01671     unsigned TyAlign = TD->getABITypeAlignment(Ty);
01672     if (TyAlign > 8)
01673       return TyAlign;
01674     return 8;
01675   }
01676 
01677   unsigned Align = 4;
01678   if (Subtarget->hasSSE1())
01679     getMaxByValAlign(Ty, Align);
01680   return Align;
01681 }
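// For illustration (hypothetical type): on 32-bit x86 with SSE enabled, a byval
// argument of type
//   struct S { __m128 V; int I; };
// reports 16-byte alignment here because getMaxByValAlign sees the 128-bit
// vector member, while a struct of plain ints keeps the 4-byte default.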
01682 
01683 /// Returns the target-specific optimal type for load
01684 /// and store operations as a result of memset, memcpy, and memmove
01685 /// lowering. If DstAlign is zero, that means the destination alignment can
01686 /// satisfy any constraint. Similarly, if SrcAlign is zero, there is no need
01687 /// to check it against an alignment requirement, probably because the source
01688 /// does not need to be loaded. If 'IsMemset' is true, that means it's
01689 /// expanding a memset. If 'ZeroMemset' is true, that means it's a memset of
01690 /// zero. 'MemcpyStrSrc' indicates whether the memcpy
01691 /// source is constant so it does not need to be loaded.
01692 /// It returns EVT::Other if the type should be determined using generic
01693 /// target-independent logic.
01694 EVT
01695 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
01696                                        unsigned DstAlign, unsigned SrcAlign,
01697                                        bool IsMemset, bool ZeroMemset,
01698                                        bool MemcpyStrSrc,
01699                                        MachineFunction &MF) const {
01700   const Function *F = MF.getFunction();
01701   if ((!IsMemset || ZeroMemset) &&
01702       !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
01703     if (Size >= 16 &&
01704         (Subtarget->isUnalignedMemAccessFast() ||
01705          ((DstAlign == 0 || DstAlign >= 16) &&
01706           (SrcAlign == 0 || SrcAlign >= 16)))) {
01707       if (Size >= 32) {
01708         if (Subtarget->hasInt256())
01709           return MVT::v8i32;
01710         if (Subtarget->hasFp256())
01711           return MVT::v8f32;
01712       }
01713       if (Subtarget->hasSSE2())
01714         return MVT::v4i32;
01715       if (Subtarget->hasSSE1())
01716         return MVT::v4f32;
01717     } else if (!MemcpyStrSrc && Size >= 8 &&
01718                !Subtarget->is64Bit() &&
01719                Subtarget->hasSSE2()) {
01720       // Do not use f64 to lower memcpy if the source is a string constant.
01721       // It's better to use i32 to avoid the loads.
01722       return MVT::f64;
01723     }
01724   }
01725   if (Subtarget->is64Bit() && Size >= 8)
01726     return MVT::i64;
01727   return MVT::i32;
01728 }
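// Worked example for the logic above (assuming sufficient alignment or fast
// unaligned access, and no NoImplicitFloat attribute): a 64-byte memcpy picks
// v8i32 with AVX2 (two 32-byte chunks), v4i32 with SSE2 (four 16-byte chunks),
// and otherwise falls back to i64 or i32 sized copies.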
01729 
01730 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
01731   if (VT == MVT::f32)
01732     return X86ScalarSSEf32;
01733   else if (VT == MVT::f64)
01734     return X86ScalarSSEf64;
01735   return true;
01736 }
01737 
01738 bool
01739 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
01740                                                   unsigned,
01741                                                   unsigned,
01742                                                   bool *Fast) const {
01743   if (Fast)
01744     *Fast = Subtarget->isUnalignedMemAccessFast();
01745   return true;
01746 }
01747 
01748 /// Return the entry encoding for a jump table in the
01749 /// current function.  The returned value is a member of the
01750 /// MachineJumpTableInfo::JTEntryKind enum.
01751 unsigned X86TargetLowering::getJumpTableEncoding() const {
01752   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
01753   // symbol.
01754   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01755       Subtarget->isPICStyleGOT())
01756     return MachineJumpTableInfo::EK_Custom32;
01757 
01758   // Otherwise, use the normal jump table encoding heuristics.
01759   return TargetLowering::getJumpTableEncoding();
01760 }
01761 
01762 const MCExpr *
01763 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
01764                                              const MachineBasicBlock *MBB,
01765                                              unsigned uid,MCContext &Ctx) const{
01766   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
01767          Subtarget->isPICStyleGOT());
01768   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
01769   // entries.
01770   return MCSymbolRefExpr::Create(MBB->getSymbol(),
01771                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
01772 }
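// For illustration: with PIC on 32-bit ELF, the jump table built from this
// contains entries of the form (hypothetical label)
//   .long .LBB0_3@GOTOFF
// i.e. each entry is the target block's offset from the GOT base rather than an
// absolute address.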
01773 
01774 /// Returns relocation base for the given PIC jumptable.
01775 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
01776                                                     SelectionDAG &DAG) const {
01777   if (!Subtarget->is64Bit())
01778     // This doesn't have SDLoc associated with it, but is not really the
01779     // same as a Register.
01780     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
01781   return Table;
01782 }
01783 
01784 /// This returns the relocation base for the given PIC jumptable,
01785 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
01786 const MCExpr *X86TargetLowering::
01787 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
01788                              MCContext &Ctx) const {
01789   // X86-64 uses RIP relative addressing based on the jump table label.
01790   if (Subtarget->isPICStyleRIPRel())
01791     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
01792 
01793   // Otherwise, the reference is relative to the PIC base.
01794   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
01795 }
01796 
01797 std::pair<const TargetRegisterClass *, uint8_t>
01798 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
01799                                            MVT VT) const {
01800   const TargetRegisterClass *RRC = nullptr;
01801   uint8_t Cost = 1;
01802   switch (VT.SimpleTy) {
01803   default:
01804     return TargetLowering::findRepresentativeClass(TRI, VT);
01805   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
01806     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
01807     break;
01808   case MVT::x86mmx:
01809     RRC = &X86::VR64RegClass;
01810     break;
01811   case MVT::f32: case MVT::f64:
01812   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
01813   case MVT::v4f32: case MVT::v2f64:
01814   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
01815   case MVT::v4f64:
01816     RRC = &X86::VR128RegClass;
01817     break;
01818   }
01819   return std::make_pair(RRC, Cost);
01820 }
01821 
01822 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
01823                                                unsigned &Offset) const {
01824   if (!Subtarget->isTargetLinux())
01825     return false;
01826 
01827   if (Subtarget->is64Bit()) {
01828     // %fs:0x28, unless we're using the Kernel code model, in which case it's %gs:0x28.
01829     Offset = 0x28;
01830     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
01831       AddressSpace = 256;
01832     else
01833       AddressSpace = 257;
01834   } else {
01835     // %gs:0x14 on i386
01836     Offset = 0x14;
01837     AddressSpace = 256;
01838   }
01839   return true;
01840 }
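// For illustration: with the locations above, the emitted stack-protector check
// loads the canary roughly as
//   movq %fs:40, %rax     # x86-64 Linux (address space 257, offset 0x28)
//   movl %gs:20, %eax     # i386 Linux   (address space 256, offset 0x14)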
01841 
01842 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
01843                                             unsigned DestAS) const {
01844   assert(SrcAS != DestAS && "Expected different address spaces!");
01845 
01846   return SrcAS < 256 && DestAS < 256;
01847 }
01848 
01849 //===----------------------------------------------------------------------===//
01850 //               Return Value Calling Convention Implementation
01851 //===----------------------------------------------------------------------===//
01852 
01853 #include "X86GenCallingConv.inc"
01854 
01855 bool
01856 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
01857                                   MachineFunction &MF, bool isVarArg,
01858                         const SmallVectorImpl<ISD::OutputArg> &Outs,
01859                         LLVMContext &Context) const {
01860   SmallVector<CCValAssign, 16> RVLocs;
01861   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
01862   return CCInfo.CheckReturn(Outs, RetCC_X86);
01863 }
01864 
01865 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
01866   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
01867   return ScratchRegs;
01868 }
01869 
01870 SDValue
01871 X86TargetLowering::LowerReturn(SDValue Chain,
01872                                CallingConv::ID CallConv, bool isVarArg,
01873                                const SmallVectorImpl<ISD::OutputArg> &Outs,
01874                                const SmallVectorImpl<SDValue> &OutVals,
01875                                SDLoc dl, SelectionDAG &DAG) const {
01876   MachineFunction &MF = DAG.getMachineFunction();
01877   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
01878 
01879   SmallVector<CCValAssign, 16> RVLocs;
01880   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
01881   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
01882 
01883   SDValue Flag;
01884   SmallVector<SDValue, 6> RetOps;
01885   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
01886   // Operand #1 = Bytes To Pop
01887   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
01888                    MVT::i16));
01889 
01890   // Copy the result values into the output registers.
01891   for (unsigned i = 0; i != RVLocs.size(); ++i) {
01892     CCValAssign &VA = RVLocs[i];
01893     assert(VA.isRegLoc() && "Can only return in registers!");
01894     SDValue ValToCopy = OutVals[i];
01895     EVT ValVT = ValToCopy.getValueType();
01896 
01897     // Promote values to the appropriate types.
01898     if (VA.getLocInfo() == CCValAssign::SExt)
01899       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
01900     else if (VA.getLocInfo() == CCValAssign::ZExt)
01901       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
01902     else if (VA.getLocInfo() == CCValAssign::AExt)
01903       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
01904     else if (VA.getLocInfo() == CCValAssign::BCvt)
01905       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
01906 
01907     assert(VA.getLocInfo() != CCValAssign::FPExt &&
01908            "Unexpected FP-extend for return value.");
01909 
01910     // If this is x86-64, and we disabled SSE, we can't return FP values,
01911     // or SSE or MMX vectors.
01912     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
01913          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
01914           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
01915       report_fatal_error("SSE register return with SSE disabled");
01916     }
01917     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
01918     // llvm-gcc has never done it right and no one has noticed, so this
01919     // should be OK for now.
01920     if (ValVT == MVT::f64 &&
01921         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
01922       report_fatal_error("SSE2 register return with SSE2 disabled");
01923 
01924     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
01925     // the RET instruction and handled by the FP Stackifier.
01926     if (VA.getLocReg() == X86::FP0 ||
01927         VA.getLocReg() == X86::FP1) {
01928       // If this is a copy from an xmm register to ST(0), use an FPExtend to
01929       // change the value to the FP stack register class.
01930       if (isScalarFPTypeInSSEReg(VA.getValVT()))
01931         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
01932       RetOps.push_back(ValToCopy);
01933       // Don't emit a copytoreg.
01934       continue;
01935     }
01936 
01937     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
01938     // which is returned in RAX / RDX.
01939     if (Subtarget->is64Bit()) {
01940       if (ValVT == MVT::x86mmx) {
01941         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
01942           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
01943           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
01944                                   ValToCopy);
01945           // If we don't have SSE2 available, convert to v4f32 so the generated
01946           // register is legal.
01947           if (!Subtarget->hasSSE2())
01948             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
01949         }
01950       }
01951     }
01952 
01953     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
01954     Flag = Chain.getValue(1);
01955     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
01956   }
01957 
01958   // The x86-64 ABIs require that for returning structs by value we copy
01959   // the sret argument into %rax/%eax (depending on ABI) for the return.
01960   // Win32 requires us to put the sret argument to %eax as well.
01961   // We saved the argument into a virtual register in the entry block,
01962   // so now we copy the value out and into %rax/%eax.
01963   //
01964   // Checking Function.hasStructRetAttr() here is insufficient because the IR
01965   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
01966   // false, then an sret argument may be implicitly inserted in the SelDAG. In
01967   // either case FuncInfo->setSRetReturnReg() will have been called.
01968   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
01969     assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) &&
01970            "No need for an sret register");
01971     SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
01972 
01973     unsigned RetValReg
01974         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
01975           X86::RAX : X86::EAX;
01976     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
01977     Flag = Chain.getValue(1);
01978 
01979     // RAX/EAX now acts like a return value.
01980     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
01981   }
01982 
01983   RetOps[0] = Chain;  // Update chain.
01984 
01985   // Add the flag if we have it.
01986   if (Flag.getNode())
01987     RetOps.push_back(Flag);
01988 
01989   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
01990 }
01991 
01992 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
01993   if (N->getNumValues() != 1)
01994     return false;
01995   if (!N->hasNUsesOfValue(1, 0))
01996     return false;
01997 
01998   SDValue TCChain = Chain;
01999   SDNode *Copy = *N->use_begin();
02000   if (Copy->getOpcode() == ISD::CopyToReg) {
02001     // If the copy has a glue operand, we conservatively assume it isn't safe to
02002     // perform a tail call.
02003     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
02004       return false;
02005     TCChain = Copy->getOperand(0);
02006   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
02007     return false;
02008 
02009   bool HasRet = false;
02010   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
02011        UI != UE; ++UI) {
02012     if (UI->getOpcode() != X86ISD::RET_FLAG)
02013       return false;
02014     // If we are returning more than one value, we can definitely
02015     // not make a tail call; see PR19530.
02016     if (UI->getNumOperands() > 4)
02017       return false;
02018     if (UI->getNumOperands() == 4 &&
02019         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
02020       return false;
02021     HasRet = true;
02022   }
02023 
02024   if (!HasRet)
02025     return false;
02026 
02027   Chain = TCChain;
02028   return true;
02029 }
02030 
02031 EVT
02032 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
02033                                             ISD::NodeType ExtendKind) const {
02034   MVT ReturnMVT;
02035   // TODO: Is this also valid on 32-bit?
02036   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
02037     ReturnMVT = MVT::i8;
02038   else
02039     ReturnMVT = MVT::i32;
02040 
02041   EVT MinVT = getRegisterType(Context, ReturnMVT);
02042   return VT.bitsLT(MinVT) ? MinVT : VT;
02043 }
02044 
02045 /// Lower the result values of a call into the
02046 /// appropriate copies out of appropriate physical registers.
02047 ///
02048 SDValue
02049 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
02050                                    CallingConv::ID CallConv, bool isVarArg,
02051                                    const SmallVectorImpl<ISD::InputArg> &Ins,
02052                                    SDLoc dl, SelectionDAG &DAG,
02053                                    SmallVectorImpl<SDValue> &InVals) const {
02054 
02055   // Assign locations to each value returned by this call.
02056   SmallVector<CCValAssign, 16> RVLocs;
02057   bool Is64Bit = Subtarget->is64Bit();
02058   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
02059                  *DAG.getContext());
02060   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
02061 
02062   // Copy all of the result registers out of their specified physreg.
02063   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
02064     CCValAssign &VA = RVLocs[i];
02065     EVT CopyVT = VA.getValVT();
02066 
02067     // If this is x86-64, and we disabled SSE, we can't return FP values
02068     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
02069         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
02070       report_fatal_error("SSE register return with SSE disabled");
02071     }
02072 
02073     // If we prefer to use the value in xmm registers, copy it out as f80 and
02074     // use a truncate to move it from fp stack reg to xmm reg.
02075     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
02076         isScalarFPTypeInSSEReg(VA.getValVT()))
02077       CopyVT = MVT::f80;
02078 
02079     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
02080                                CopyVT, InFlag).getValue(1);
02081     SDValue Val = Chain.getValue(0);
02082 
02083     if (CopyVT != VA.getValVT())
02084       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
02085                         // This truncation won't change the value.
02086                         DAG.getIntPtrConstant(1));
02087 
02088     InFlag = Chain.getValue(2);
02089     InVals.push_back(Val);
02090   }
02091 
02092   return Chain;
02093 }
02094 
02095 //===----------------------------------------------------------------------===//
02096 //                C & StdCall & Fast Calling Convention implementation
02097 //===----------------------------------------------------------------------===//
02098 //  The StdCall calling convention is standard for many Windows API routines.
02099 //  It differs from the C calling convention only slightly: the callee, not
02100 //  the caller, cleans up the stack, and symbols are decorated (name-mangled).
02101 //  It doesn't support any vector arguments.
02102 //  For info on the fast calling convention (tail call), see its
02103 //  implementation in LowerX86_32FastCCCallTo.
02104 
02105 /// Determines whether a call uses struct return
02106 /// semantics.
02107 enum StructReturnType {
02108   NotStructReturn,
02109   RegStructReturn,
02110   StackStructReturn
02111 };
02112 static StructReturnType
02113 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
02114   if (Outs.empty())
02115     return NotStructReturn;
02116 
02117   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
02118   if (!Flags.isSRet())
02119     return NotStructReturn;
02120   if (Flags.isInReg())
02121     return RegStructReturn;
02122   return StackStructReturn;
02123 }
02124 
02125 /// Determines whether a function uses struct return semantics.
02126 static StructReturnType
02127 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
02128   if (Ins.empty())
02129     return NotStructReturn;
02130 
02131   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
02132   if (!Flags.isSRet())
02133     return NotStructReturn;
02134   if (Flags.isInReg())
02135     return RegStructReturn;
02136   return StackStructReturn;
02137 }
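// For illustration (hypothetical IR): both helpers key off the 'sret' attribute
// of the first argument, e.g.
//   define void @make(%struct.Big* sret %out, i32 %x)
// A plain sret argument is classified as StackStructReturn; marking it 'inreg'
// as well (as some 32-bit ABIs do) yields RegStructReturn.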
02138 
02139 /// Make a copy of an aggregate at the address specified by "Src" to the
02140 /// address "Dst" with size and alignment information specified by the
02141 /// parameter attribute. The copy will be passed as a byval function parameter.
02142 static SDValue
02143 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
02144                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
02145                           SDLoc dl) {
02146   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
02147 
02148   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
02149                        /*isVolatile*/false, /*AlwaysInline=*/true,
02150                        /*isTailCall*/false,
02151                        MachinePointerInfo(), MachinePointerInfo());
02152 }
02153 
02154 /// Return true if the calling convention is one that
02155 /// supports tail call optimization.
02156 static bool IsTailCallConvention(CallingConv::ID CC) {
02157   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
02158           CC == CallingConv::HiPE);
02159 }
02160 
02161 /// \brief Return true if the calling convention is a C calling convention.
02162 static bool IsCCallConvention(CallingConv::ID CC) {
02163   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
02164           CC == CallingConv::X86_64_SysV);
02165 }
02166 
02167 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
02168   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
02169     return false;
02170 
02171   CallSite CS(CI);
02172   CallingConv::ID CalleeCC = CS.getCallingConv();
02173   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
02174     return false;
02175 
02176   return true;
02177 }
02178 
02179 /// Return true if the function is being made into
02180 /// a tailcall target by changing its ABI.
02181 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
02182                                    bool GuaranteedTailCallOpt) {
02183   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
02184 }
02185 
02186 SDValue
02187 X86TargetLowering::LowerMemArgument(SDValue Chain,
02188                                     CallingConv::ID CallConv,
02189                                     const SmallVectorImpl<ISD::InputArg> &Ins,
02190                                     SDLoc dl, SelectionDAG &DAG,
02191                                     const CCValAssign &VA,
02192                                     MachineFrameInfo *MFI,
02193                                     unsigned i) const {
02194   // Create the nodes corresponding to a load from this parameter slot.
02195   ISD::ArgFlagsTy Flags = Ins[i].Flags;
02196   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
02197       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
02198   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
02199   EVT ValVT;
02200 
02201   // If the value is passed by pointer, we have the address passed instead of
02202   // the value itself.
02203   if (VA.getLocInfo() == CCValAssign::Indirect)
02204     ValVT = VA.getLocVT();
02205   else
02206     ValVT = VA.getValVT();
02207 
02208   // FIXME: For now, all byval parameter objects are marked mutable. This can be
02209   // changed with more analysis.
02210   // In case of tail call optimization, mark all arguments mutable, since they
02211   // could be overwritten by the lowering of arguments in case of a tail call.
02212   if (Flags.isByVal()) {
02213     unsigned Bytes = Flags.getByValSize();
02214     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
02215     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
02216     return DAG.getFrameIndex(FI, getPointerTy());
02217   } else {
02218     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
02219                                     VA.getLocMemOffset(), isImmutable);
02220     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
02221     return DAG.getLoad(ValVT, dl, Chain, FIN,
02222                        MachinePointerInfo::getFixedStack(FI),
02223                        false, false, false, 0);
02224   }
02225 }
02226 
02227 // FIXME: Get this from tablegen.
02228 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
02229                                                 const X86Subtarget *Subtarget) {
02230   assert(Subtarget->is64Bit());
02231 
02232   if (Subtarget->isCallingConvWin64(CallConv)) {
02233     static const MCPhysReg GPR64ArgRegsWin64[] = {
02234       X86::RCX, X86::RDX, X86::R8,  X86::R9
02235     };
02236     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
02237   }
02238 
02239   static const MCPhysReg GPR64ArgRegs64Bit[] = {
02240     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
02241   };
02242   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
02243 }
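// For illustration: these are the registers whose incoming values a varargs
// function may need to spill to its register save area. E.g. for
//   int f(int a, int b, ...);
// the fixed arguments arrive in RDI/RSI under the SysV convention but in
// RCX/RDX under Win64 (which also reserves a 32-byte shadow area, allocated
// below in LowerFormalArguments).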
02244 
02245 // FIXME: Get this from tablegen.
02246 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
02247                                                 CallingConv::ID CallConv,
02248                                                 const X86Subtarget *Subtarget) {
02249   assert(Subtarget->is64Bit());
02250   if (Subtarget->isCallingConvWin64(CallConv)) {
02251     // The XMM registers which might contain var arg parameters are shadowed
02252     // by their paired GPRs, so we only need to save the GPRs to their home
02253     // slots.
02254     // TODO: __vectorcall will change this.
02255     return None;
02256   }
02257 
02258   const Function *Fn = MF.getFunction();
02259   bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
02260   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
02261          "SSE register cannot be used when SSE is disabled!");
02262   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
02263       !Subtarget->hasSSE1())
02264     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
02265     // registers.
02266     return None;
02267 
02268   static const MCPhysReg XMMArgRegs64Bit[] = {
02269     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02270     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02271   };
02272   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
02273 }
02274 
02275 SDValue
02276 X86TargetLowering::LowerFormalArguments(SDValue Chain,
02277                                         CallingConv::ID CallConv,
02278                                         bool isVarArg,
02279                                       const SmallVectorImpl<ISD::InputArg> &Ins,
02280                                         SDLoc dl,
02281                                         SelectionDAG &DAG,
02282                                         SmallVectorImpl<SDValue> &InVals)
02283                                           const {
02284   MachineFunction &MF = DAG.getMachineFunction();
02285   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02286   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
02287 
02288   const Function* Fn = MF.getFunction();
02289   if (Fn->hasExternalLinkage() &&
02290       Subtarget->isTargetCygMing() &&
02291       Fn->getName() == "main")
02292     FuncInfo->setForceFramePointer(true);
02293 
02294   MachineFrameInfo *MFI = MF.getFrameInfo();
02295   bool Is64Bit = Subtarget->is64Bit();
02296   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
02297 
02298   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02299          "Var args not supported with calling convention fastcc, ghc or hipe");
02300 
02301   // Assign locations to all of the incoming arguments.
02302   SmallVector<CCValAssign, 16> ArgLocs;
02303   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02304 
02305   // Allocate shadow area for Win64
02306   if (IsWin64)
02307     CCInfo.AllocateStack(32, 8);
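  // For reference, a sketch of the Win64 frame as seen on entry to the callee
  // (offsets relative to RSP after the CALL; 8-byte slots assumed):
  //   [RSP +  0]  return address pushed by CALL
  //   [RSP +  8]  home slot for RCX  \
  //   [RSP + 16]  home slot for RDX   |  the 32-byte shadow area reserved above
  //   [RSP + 24]  home slot for R8    |
  //   [RSP + 32]  home slot for R9   /
  //   [RSP + 40]  fifth argument, if any, and further stack arguments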
02308 
02309   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
02310 
02311   unsigned LastVal = ~0U;
02312   SDValue ArgValue;
02313   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02314     CCValAssign &VA = ArgLocs[i];
02315     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
02316     // places.
02317     assert(VA.getValNo() != LastVal &&
02318            "Don't support value assigned to multiple locs yet");
02319     (void)LastVal;
02320     LastVal = VA.getValNo();
02321 
02322     if (VA.isRegLoc()) {
02323       EVT RegVT = VA.getLocVT();
02324       const TargetRegisterClass *RC;
02325       if (RegVT == MVT::i32)
02326         RC = &X86::GR32RegClass;
02327       else if (Is64Bit && RegVT == MVT::i64)
02328         RC = &X86::GR64RegClass;
02329       else if (RegVT == MVT::f32)
02330         RC = &X86::FR32RegClass;
02331       else if (RegVT == MVT::f64)
02332         RC = &X86::FR64RegClass;
02333       else if (RegVT.is512BitVector())
02334         RC = &X86::VR512RegClass;
02335       else if (RegVT.is256BitVector())
02336         RC = &X86::VR256RegClass;
02337       else if (RegVT.is128BitVector())
02338         RC = &X86::VR128RegClass;
02339       else if (RegVT == MVT::x86mmx)
02340         RC = &X86::VR64RegClass;
02341       else if (RegVT == MVT::i1)
02342         RC = &X86::VK1RegClass;
02343       else if (RegVT == MVT::v8i1)
02344         RC = &X86::VK8RegClass;
02345       else if (RegVT == MVT::v16i1)
02346         RC = &X86::VK16RegClass;
02347       else if (RegVT == MVT::v32i1)
02348         RC = &X86::VK32RegClass;
02349       else if (RegVT == MVT::v64i1)
02350         RC = &X86::VK64RegClass;
02351       else
02352         llvm_unreachable("Unknown argument type!");
02353 
02354       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
02355       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
02356 
02357       // If this is an 8 or 16-bit value, it is really passed promoted to 32
02358       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
02359       // right size.
02360       if (VA.getLocInfo() == CCValAssign::SExt)
02361         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
02362                                DAG.getValueType(VA.getValVT()));
02363       else if (VA.getLocInfo() == CCValAssign::ZExt)
02364         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
02365                                DAG.getValueType(VA.getValVT()));
02366       else if (VA.getLocInfo() == CCValAssign::BCvt)
02367         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
02368 
02369       if (VA.isExtInLoc()) {
02370         // Handle MMX values passed in XMM regs.
02371         if (RegVT.isVector())
02372           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
02373         else
02374           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
02375       }
02376     } else {
02377       assert(VA.isMemLoc());
02378       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
02379     }
02380 
02381     // If the value is passed via a pointer, do a load.
02382     if (VA.getLocInfo() == CCValAssign::Indirect)
02383       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
02384                              MachinePointerInfo(), false, false, false, 0);
02385 
02386     InVals.push_back(ArgValue);
02387   }
02388 
02389   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
02390     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02391       // The x86-64 ABIs require that for returning structs by value we copy
02392       // the sret argument into %rax/%eax (depending on ABI) for the return.
02393       // Win32 requires us to put the sret argument in %eax as well.
02394       // Save the argument into a virtual register so that we can access it
02395       // from the return points.
02396       if (Ins[i].Flags.isSRet()) {
02397         unsigned Reg = FuncInfo->getSRetReturnReg();
02398         if (!Reg) {
02399           MVT PtrTy = getPointerTy();
02400           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
02401           FuncInfo->setSRetReturnReg(Reg);
02402         }
02403         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
02404         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
02405         break;
02406       }
02407     }
02408   }
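  // For example (illustrative IR; @make and %struct.S are hypothetical), a
  // function such as
  //   define void @make(%struct.S* sret %out) { ... ret void }
  // must leave the incoming %out pointer in EAX/RAX when it returns; the copy
  // into a virtual register above is what the return lowering reads back.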
02409 
02410   unsigned StackSize = CCInfo.getNextStackOffset();
02411   // Align stack specially for tail calls.
02412   if (FuncIsMadeTailCallSafe(CallConv,
02413                              MF.getTarget().Options.GuaranteedTailCallOpt))
02414     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
02415 
02416   // If the function takes a variable number of arguments, make a frame index for
02417   // the start of the first vararg value... for expansion of llvm.va_start. We
02418   // can skip this if there are no va_start calls.
02419   if (MFI->hasVAStart() &&
02420       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
02421                    CallConv != CallingConv::X86_ThisCall))) {
02422     FuncInfo->setVarArgsFrameIndex(
02423         MFI->CreateFixedObject(1, StackSize, true));
02424   }
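  // Rough illustration for x86-32 cdecl: for f(int a, ...) the only named stack
  // argument is 'a', so StackSize == 4 and the fixed object created above sits
  // at offset 4 in the incoming argument area, i.e. immediately after 'a'.
  // va_start therefore yields the address of the first variadic argument on the
  // caller's stack.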
02425 
02426   MachineModuleInfo &MMI = MF.getMMI();
02427   const Function *WinEHParent = nullptr;
02428   if (IsWin64 && MMI.hasWinEHFuncInfo(Fn))
02429     WinEHParent = MMI.getWinEHParent(Fn);
02430   bool IsWinEHOutlined = WinEHParent && WinEHParent != Fn;
02431   bool IsWinEHParent = WinEHParent && WinEHParent == Fn;
02432 
02433   // Figure out if XMM registers are in use.
02434   assert(!(MF.getTarget().Options.UseSoftFloat &&
02435            Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
02436          "SSE register cannot be used when SSE is disabled!");
02437 
02438   // 64-bit calling conventions support varargs and register parameters, so we
02439   // have to do extra work to spill them in the prologue.
02440   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
02441     // Find the first unallocated argument register in each register class.
02442     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
02443     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
02444     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
02445     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
02446     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
02447            "SSE register cannot be used when SSE is disabled!");
02448 
02449     // Gather all the live in physical registers.
02450     SmallVector<SDValue, 6> LiveGPRs;
02451     SmallVector<SDValue, 8> LiveXMMRegs;
02452     SDValue ALVal;
02453     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
02454       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
02455       LiveGPRs.push_back(
02456           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
02457     }
02458     if (!ArgXMMs.empty()) {
02459       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02460       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
02461       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
02462         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
02463         LiveXMMRegs.push_back(
02464             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
02465       }
02466     }
02467 
02468     if (IsWin64) {
02469       // Get to the caller-allocated home save location.  Add 8 to account
02470       // for the return address.
02471       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
02472       FuncInfo->setRegSaveFrameIndex(
02473           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
02474       // Fix up the vararg frame index to point into the shadow area (4 x i64).
02475       if (NumIntRegs < 4)
02476         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
02477     } else {
02478       // For X86-64, if there are vararg parameters that are passed via
02479       // registers, then we must store them to their spots on the stack so
02480       // they may be loaded by dereferencing the result of va_next.
02481       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
02482       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
02483       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
02484           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
02485     }
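    // Illustrative numbers for the standard SysV x86-64 case: ArgGPRs.size() is
    // 6 and ArgXMMs.size() is 8, so the register save area is 6*8 + 8*16 = 176
    // bytes. If, say, two GPRs and one XMM were consumed by named arguments,
    // va_arg starts reading at gp_offset = 2*8 = 16 and fp_offset = 48 + 16 = 64
    // within that area, which is what the two setters above record.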
02486 
02487     // Store the integer parameter registers.
02488     SmallVector<SDValue, 8> MemOps;
02489     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
02490                                       getPointerTy());
02491     unsigned Offset = FuncInfo->getVarArgsGPOffset();
02492     for (SDValue Val : LiveGPRs) {
02493       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
02494                                 DAG.getIntPtrConstant(Offset));
02495       SDValue Store =
02496         DAG.getStore(Val.getValue(1), dl, Val, FIN,
02497                      MachinePointerInfo::getFixedStack(
02498                        FuncInfo->getRegSaveFrameIndex(), Offset),
02499                      false, false, 0);
02500       MemOps.push_back(Store);
02501       Offset += 8;
02502     }
02503 
02504     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
02505       // Now store the XMM (fp + vector) parameter registers.
02506       SmallVector<SDValue, 12> SaveXMMOps;
02507       SaveXMMOps.push_back(Chain);
02508       SaveXMMOps.push_back(ALVal);
02509       SaveXMMOps.push_back(DAG.getIntPtrConstant(
02510                              FuncInfo->getRegSaveFrameIndex()));
02511       SaveXMMOps.push_back(DAG.getIntPtrConstant(
02512                              FuncInfo->getVarArgsFPOffset()));
02513       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
02514                         LiveXMMRegs.end());
02515       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
02516                                    MVT::Other, SaveXMMOps));
02517     }
02518 
02519     if (!MemOps.empty())
02520       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
02521   } else if (IsWinEHOutlined) {
02522     // Get to the caller-allocated home save location.  Add 8 to account
02523     // for the return address.
02524     int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
02525     FuncInfo->setRegSaveFrameIndex(MFI->CreateFixedObject(
02526         /*Size=*/1, /*SPOffset=*/HomeOffset + 8, /*Immutable=*/false));
02527 
02528     MMI.getWinEHFuncInfo(Fn)
02529         .CatchHandlerParentFrameObjIdx[const_cast<Function *>(Fn)] =
02530         FuncInfo->getRegSaveFrameIndex();
02531 
02532     // Store the second integer parameter (rdx) into rsp+16 relative to the
02533     // stack pointer at the entry of the function.
02534     SDValue RSFIN =
02535         DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), getPointerTy());
02536     unsigned GPR = MF.addLiveIn(X86::RDX, &X86::GR64RegClass);
02537     SDValue Val = DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64);
02538     Chain = DAG.getStore(
02539         Val.getValue(1), dl, Val, RSFIN,
02540         MachinePointerInfo::getFixedStack(FuncInfo->getRegSaveFrameIndex()),
02541         /*isVolatile=*/true, /*isNonTemporal=*/false, /*Alignment=*/0);
02542   }
02543 
02544   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
02545     // Find the largest legal vector type.
02546     MVT VecVT = MVT::Other;
02547     // FIXME: Only some x86_32 calling conventions support AVX512.
02548     if (Subtarget->hasAVX512() &&
02549         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
02550                      CallConv == CallingConv::Intel_OCL_BI)))
02551       VecVT = MVT::v16f32;
02552     else if (Subtarget->hasAVX())
02553       VecVT = MVT::v8f32;
02554     else if (Subtarget->hasSSE2())
02555       VecVT = MVT::v4f32;
02556 
02557     // We forward some GPRs and some vector types.
02558     SmallVector<MVT, 2> RegParmTypes;
02559     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
02560     RegParmTypes.push_back(IntVT);
02561     if (VecVT != MVT::Other)
02562       RegParmTypes.push_back(VecVT);
02563 
02564     // Compute the set of forwarded registers. The rest are scratch.
02565     SmallVectorImpl<ForwardedRegister> &Forwards =
02566         FuncInfo->getForwardedMustTailRegParms();
02567     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
02568 
02569     // Conservatively forward AL on x86_64, since it might be used for varargs.
02570     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
02571       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02572       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
02573     }
02574 
02575     // Copy all forwards from physical to virtual registers.
02576     for (ForwardedRegister &F : Forwards) {
02577       // FIXME: Can we use a less constrained schedule?
02578       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
02579       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
02580       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
02581     }
02582   }
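  // A sketch of the kind of IR this serves (hypothetical thunk; @thunk and
  // @impl are made-up names; the varargs are forwarded implicitly by musttail):
  //   define i32 @thunk(i8* %this, ...) {
  //     %r = musttail call i32 (i8*, ...) @impl(i8* %this, ...)
  //     ret i32 %r
  //   }
  // Since we cannot know here which registers the eventual callee will read,
  // the argument GPRs/XMMs are pinned in virtual registers and re-materialized
  // at the call site (see the IsMustTail handling in LowerCall).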
02583 
02584   // Some CCs need callee pop.
02585   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02586                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
02587     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
02588   } else {
02589     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
02590     // If this is an sret function, the return should pop the hidden pointer.
02591     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
02592         !Subtarget->getTargetTriple().isOSMSVCRT() &&
02593         argsAreStructReturn(Ins) == StackStructReturn)
02594       FuncInfo->setBytesToPopOnReturn(4);
02595   }
02596 
02597   if (!Is64Bit) {
02598     // RegSaveFrameIndex is X86-64 only.
02599     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
02600     if (CallConv == CallingConv::X86_FastCall ||
02601         CallConv == CallingConv::X86_ThisCall)
02602       // fastcall and thiscall functions can't have varargs.
02603       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
02604   }
02605 
02606   FuncInfo->setArgumentStackSize(StackSize);
02607 
02608   if (IsWinEHParent) {
02609     int UnwindHelpFI = MFI->CreateStackObject(8, 8, /*isSS=*/false);
02610     SDValue StackSlot = DAG.getFrameIndex(UnwindHelpFI, MVT::i64);
02611     MMI.getWinEHFuncInfo(MF.getFunction()).UnwindHelpFrameIdx = UnwindHelpFI;
02612     SDValue Neg2 = DAG.getConstant(-2, MVT::i64);
02613     Chain = DAG.getStore(Chain, dl, Neg2, StackSlot,
02614                          MachinePointerInfo::getFixedStack(UnwindHelpFI),
02615                          /*isVolatile=*/true,
02616                          /*isNonTemporal=*/false, /*Alignment=*/0);
02617   }
02618 
02619   return Chain;
02620 }
02621 
02622 SDValue
02623 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
02624                                     SDValue StackPtr, SDValue Arg,
02625                                     SDLoc dl, SelectionDAG &DAG,
02626                                     const CCValAssign &VA,
02627                                     ISD::ArgFlagsTy Flags) const {
02628   unsigned LocMemOffset = VA.getLocMemOffset();
02629   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
02630   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
02631   if (Flags.isByVal())
02632     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
02633 
02634   return DAG.getStore(Chain, dl, Arg, PtrOff,
02635                       MachinePointerInfo::getStack(LocMemOffset),
02636                       false, false, 0);
02637 }
02638 
02639 /// Emit a load of the return address if tail call
02640 /// optimization is performed and it is required.
02641 SDValue
02642 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
02643                                            SDValue &OutRetAddr, SDValue Chain,
02644                                            bool IsTailCall, bool Is64Bit,
02645                                            int FPDiff, SDLoc dl) const {
02646   // Adjust the Return address stack slot.
02647   EVT VT = getPointerTy();
02648   OutRetAddr = getReturnAddressFrameIndex(DAG);
02649 
02650   // Load the "old" Return address.
02651   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
02652                            false, false, false, 0);
02653   return SDValue(OutRetAddr.getNode(), 1);
02654 }
02655 
02656 /// Emit a store of the return address if tail call
02657 /// optimization is performed and it is required (FPDiff!=0).
02658 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
02659                                         SDValue Chain, SDValue RetAddrFrIdx,
02660                                         EVT PtrVT, unsigned SlotSize,
02661                                         int FPDiff, SDLoc dl) {
02662   // Store the return address to the appropriate stack slot.
02663   if (!FPDiff) return Chain;
02664   // Calculate the new stack slot for the return address.
02665   int NewReturnAddrFI =
02666     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
02667                                          false);
02668   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
02669   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
02670                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
02671                        false, false, 0);
02672   return Chain;
02673 }
02674 
02675 SDValue
02676 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
02677                              SmallVectorImpl<SDValue> &InVals) const {
02678   SelectionDAG &DAG                     = CLI.DAG;
02679   SDLoc &dl                             = CLI.DL;
02680   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
02681   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
02682   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
02683   SDValue Chain                         = CLI.Chain;
02684   SDValue Callee                        = CLI.Callee;
02685   CallingConv::ID CallConv              = CLI.CallConv;
02686   bool &isTailCall                      = CLI.IsTailCall;
02687   bool isVarArg                         = CLI.IsVarArg;
02688 
02689   MachineFunction &MF = DAG.getMachineFunction();
02690   bool Is64Bit        = Subtarget->is64Bit();
02691   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
02692   StructReturnType SR = callIsStructReturn(Outs);
02693   bool IsSibcall      = false;
02694   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
02695 
02696   if (MF.getTarget().Options.DisableTailCalls)
02697     isTailCall = false;
02698 
02699   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
02700   if (IsMustTail) {
02701     // Force this to be a tail call.  The verifier rules are enough to ensure
02702     // that we can lower this successfully without moving the return address
02703     // around.
02704     isTailCall = true;
02705   } else if (isTailCall) {
02706     // Check if it's really possible to do a tail call.
02707     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
02708                     isVarArg, SR != NotStructReturn,
02709                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
02710                     Outs, OutVals, Ins, DAG);
02711 
02712     // Sibcalls are automatically detected tailcalls which do not require
02713     // ABI changes.
02714     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
02715       IsSibcall = true;
02716 
02717     if (isTailCall)
02718       ++NumTailCalls;
02719   }
02720 
02721   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02722          "Var args not supported with calling convention fastcc, ghc or hipe");
02723 
02724   // Analyze operands of the call, assigning locations to each operand.
02725   SmallVector<CCValAssign, 16> ArgLocs;
02726   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02727 
02728   // Allocate shadow area for Win64
02729   if (IsWin64)
02730     CCInfo.AllocateStack(32, 8);
02731 
02732   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
02733 
02734   // Get a count of how many bytes are to be pushed on the stack.
02735   unsigned NumBytes = CCInfo.getNextStackOffset();
02736   if (IsSibcall)
02737     // This is a sibcall. The memory operands are already in place in the
02738     // caller's incoming argument area, so no stack space is reserved.
02739     NumBytes = 0;
02740   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
02741            IsTailCallConvention(CallConv))
02742     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
02743 
02744   int FPDiff = 0;
02745   if (isTailCall && !IsSibcall && !IsMustTail) {
02746     // Lower arguments at fp - stackoffset + fpdiff.
02747     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
02748 
02749     FPDiff = NumBytesCallerPushed - NumBytes;
02750 
02751     // Record how far the return address stack slot has to move, but only if
02752     // this call moves it further (a more negative FPDiff) than any previous one.
02753     if (FPDiff < X86Info->getTCReturnAddrDelta())
02754       X86Info->setTCReturnAddrDelta(FPDiff);
02755   }
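  // Worked example of the computation above (illustrative only): if the caller
  // was entered with 16 bytes of stack arguments (so it pops 16 on return) and
  // the tail-called callee needs NumBytes == 48, then FPDiff == 16 - 48 == -32,
  // i.e. the return address must be moved 32 bytes lower so the callee's larger
  // argument area fits below it. FPDiff == 0 means no move is needed.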
02756 
02757   unsigned NumBytesToPush = NumBytes;
02758   unsigned NumBytesToPop = NumBytes;
02759 
02760   // If we have an inalloca argument, all stack space has already been allocated
02761   // for us and is right at the top of the stack.  We don't support multiple
02762   // arguments passed in memory when using inalloca.
02763   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
02764     NumBytesToPush = 0;
02765     if (!ArgLocs.back().isMemLoc())
02766       report_fatal_error("cannot use inalloca attribute on a register "
02767                          "parameter");
02768     if (ArgLocs.back().getLocMemOffset() != 0)
02769       report_fatal_error("any parameter with the inalloca attribute must be "
02770                          "the only memory argument");
02771   }
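  // For reference, the kind of IR this corresponds to (hypothetical Win32
  // example; @takes_big and %struct.Big are made-up names):
  //   %argmem = alloca inalloca <{ %struct.Big }>
  //   ; ... initialize %argmem in place ...
  //   call void @takes_big(<{ %struct.Big }>* inalloca %argmem)
  // Because the callee's argument memory already sits at the top of the stack,
  // NumBytesToPush is forced to 0 and the CALLSEQ_START below reserves no
  // additional bytes for it.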
02772 
02773   if (!IsSibcall)
02774     Chain = DAG.getCALLSEQ_START(
02775         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
02776 
02777   SDValue RetAddrFrIdx;
02778   // Load return address for tail calls.
02779   if (isTailCall && FPDiff)
02780     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
02781                                     Is64Bit, FPDiff, dl);
02782 
02783   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
02784   SmallVector<SDValue, 8> MemOpChains;
02785   SDValue StackPtr;
02786 
02787   // Walk the register/memloc assignments, inserting copies/loads.  In the case
02788   // of tail call optimization, arguments are handled later.
02789   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
02790   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02791     // Skip inalloca arguments, they have already been written.
02792     ISD::ArgFlagsTy Flags = Outs[i].Flags;
02793     if (Flags.isInAlloca())
02794       continue;
02795 
02796     CCValAssign &VA = ArgLocs[i];
02797     EVT RegVT = VA.getLocVT();
02798     SDValue Arg = OutVals[i];
02799     bool isByVal = Flags.isByVal();
02800 
02801     // Promote the value if needed.
02802     switch (VA.getLocInfo()) {
02803     default: llvm_unreachable("Unknown loc info!");
02804     case CCValAssign::Full: break;
02805     case CCValAssign::SExt:
02806       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
02807       break;
02808     case CCValAssign::ZExt:
02809       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
02810       break;
02811     case CCValAssign::AExt:
02812       if (RegVT.is128BitVector()) {
02813         // Special case: passing MMX values in XMM registers.
02814         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
02815         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
02816         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
02817       } else
02818         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
02819       break;
02820     case CCValAssign::BCvt:
02821       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
02822       break;
02823     case CCValAssign::Indirect: {
02824       // Store the argument.
02825       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
02826       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
02827       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
02828                            MachinePointerInfo::getFixedStack(FI),
02829                            false, false, 0);
02830       Arg = SpillSlot;
02831       break;
02832     }
02833     }
02834 
02835     if (VA.isRegLoc()) {
02836       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
02837       if (isVarArg && IsWin64) {
02838         // Win64 ABI requires argument XMM reg to be copied to the corresponding
02839         // shadow reg if callee is a varargs function.
02840         unsigned ShadowReg = 0;
02841         switch (VA.getLocReg()) {
02842         case X86::XMM0: ShadowReg = X86::RCX; break;
02843         case X86::XMM1: ShadowReg = X86::RDX; break;
02844         case X86::XMM2: ShadowReg = X86::R8; break;
02845         case X86::XMM3: ShadowReg = X86::R9; break;
02846         }
02847         if (ShadowReg)
02848           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
02849       }
02850     } else if (!IsSibcall && (!isTailCall || isByVal)) {
02851       assert(VA.isMemLoc());
02852       if (!StackPtr.getNode())
02853         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
02854                                       getPointerTy());
02855       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
02856                                              dl, DAG, VA, Flags));
02857     }
02858   }
02859 
02860   if (!MemOpChains.empty())
02861     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
02862 
02863   if (Subtarget->isPICStyleGOT()) {
02864     // ELF / PIC requires the GOT pointer to be in the EBX register before
02865     // making function calls via the PLT.
02866     if (!isTailCall) {
02867       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
02868                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
02869     } else {
02870       // If we are tail calling and generating PIC/GOT style code load the
02871       // address of the callee into ECX. The value in ecx is used as target of
02872       // the tail jump. This is done to circumvent the ebx/callee-saved problem
02873       // for tail calls on PIC/GOT architectures. Normally we would just put the
02874       // address of GOT into ebx and then call target@PLT. But for tail calls
02875       // ebx would be restored (since ebx is callee saved) before jumping to the
02876       // target@PLT.
02877 
02878       // Note: The actual moving to ECX is done further down.
02879       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
02880       if (G && !G->getGlobal()->hasHiddenVisibility() &&
02881           !G->getGlobal()->hasProtectedVisibility())
02882         Callee = LowerGlobalAddress(Callee, DAG);
02883       else if (isa<ExternalSymbolSDNode>(Callee))
02884         Callee = LowerExternalSymbol(Callee, DAG);
02885     }
02886   }
02887 
02888   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
02889     // From AMD64 ABI document:
02890     // For calls that may call functions that use varargs or stdargs
02891     // (prototype-less calls or calls to functions containing ellipsis (...) in
02892     // the declaration) %al is used as a hidden argument to specify the number
02893     // of SSE registers used. The contents of %al do not need to match exactly
02894     // the number of registers, but must be an upper bound on the number of SSE
02895     // registers used and is in the range 0 - 8 inclusive.
02896 
02897     // Count the number of XMM registers allocated.
02898     static const MCPhysReg XMMArgRegs[] = {
02899       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02900       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02901     };
02902     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
02903     assert((Subtarget->hasSSE1() || !NumXMMRegs)
02904            && "SSE registers cannot be used when SSE is disabled");
02905 
02906     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
02907                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
02908   }
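  // Concrete example (illustrative): for a call like printf("%f\n", x) with one
  // double passed in XMM0, NumXMMRegs is 1, so AL is set to 1. The callee's
  // prologue (see VASTART_SAVE_XMM_REGS in LowerFormalArguments) tests AL and
  // skips spilling XMM0-XMM7 entirely when it is zero.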
02909 
02910   if (isVarArg && IsMustTail) {
02911     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
02912     for (const auto &F : Forwards) {
02913       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
02914       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
02915     }
02916   }
02917 
02918   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
02919   // don't need this because the eligibility check rejects calls that require
02920   // shuffling arguments passed in memory.
02921   if (!IsSibcall && isTailCall) {
02922     // Force all the incoming stack arguments to be loaded from the stack
02923     // before any new outgoing arguments are stored to the stack, because the
02924     // outgoing stack slots may alias the incoming argument stack slots, and
02925     // the alias isn't otherwise explicit. This is slightly more conservative
02926     // than necessary, because it means that each store effectively depends
02927     // on every argument instead of just those arguments it would clobber.
02928     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
02929 
02930     SmallVector<SDValue, 8> MemOpChains2;
02931     SDValue FIN;
02932     int FI = 0;
02933     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02934       CCValAssign &VA = ArgLocs[i];
02935       if (VA.isRegLoc())
02936         continue;
02937       assert(VA.isMemLoc());
02938       SDValue Arg = OutVals[i];
02939       ISD::ArgFlagsTy Flags = Outs[i].Flags;
02940       // Skip inalloca arguments.  They don't require any work.
02941       if (Flags.isInAlloca())
02942         continue;
02943       // Create frame index.
02944       int32_t Offset = VA.getLocMemOffset()+FPDiff;
02945       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
02946       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
02947       FIN = DAG.getFrameIndex(FI, getPointerTy());
02948 
02949       if (Flags.isByVal()) {
02950         // Copy relative to framepointer.
02951         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
02952         if (!StackPtr.getNode())
02953           StackPtr = DAG.getCopyFromReg(Chain, dl,
02954                                         RegInfo->getStackRegister(),
02955                                         getPointerTy());
02956         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
02957 
02958         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
02959                                                          ArgChain,
02960                                                          Flags, DAG, dl));
02961       } else {
02962         // Store relative to framepointer.
02963         MemOpChains2.push_back(
02964           DAG.getStore(ArgChain, dl, Arg, FIN,
02965                        MachinePointerInfo::getFixedStack(FI),
02966                        false, false, 0));
02967       }
02968     }
02969 
02970     if (!MemOpChains2.empty())
02971       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
02972 
02973     // Store the return address to the appropriate stack slot.
02974     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
02975                                      getPointerTy(), RegInfo->getSlotSize(),
02976                                      FPDiff, dl);
02977   }
02978 
02979   // Build a sequence of copy-to-reg nodes chained together with token chain
02980   // and flag operands which copy the outgoing args into registers.
02981   SDValue InFlag;
02982   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
02983     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
02984                              RegsToPass[i].second, InFlag);
02985     InFlag = Chain.getValue(1);
02986   }
02987 
02988   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
02989     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
02990     // In the 64-bit large code model, we have to make all calls
02991     // through a register, since the call instruction's 32-bit
02992     // pc-relative offset may not be large enough to hold the whole
02993     // address.
02994   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
02995     // If the callee is a GlobalAddress node (quite common, every direct call
02996     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
02997     // it.
02998     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
02999 
03000     // We should use extra load for direct calls to dllimported functions in
03001     // non-JIT mode.
03002     const GlobalValue *GV = G->getGlobal();
03003     if (!GV->hasDLLImportStorageClass()) {
03004       unsigned char OpFlags = 0;
03005       bool ExtraLoad = false;
03006       unsigned WrapperKind = ISD::DELETED_NODE;
03007 
03008       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
03009       // external symbols must go through the PLT in PIC mode.  If the symbol
03010       // has hidden or protected visibility, or if it is static or local, then
03011       // we don't need to use the PLT - we can directly call it.
03012       if (Subtarget->isTargetELF() &&
03013           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
03014           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
03015         OpFlags = X86II::MO_PLT;
03016       } else if (Subtarget->isPICStyleStubAny() &&
03017                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
03018                  (!Subtarget->getTargetTriple().isMacOSX() ||
03019                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03020         // PC-relative references to external symbols should go through $stub,
03021         // unless we're building with the leopard linker or later, which
03022         // automatically synthesizes these stubs.
03023         OpFlags = X86II::MO_DARWIN_STUB;
03024       } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
03025                  cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
03026         // If the function is marked as non-lazy, generate an indirect call
03027         // which loads from the GOT directly. This avoids runtime overhead
03028         // at the cost of eager binding (and one extra byte of encoding).
03029         OpFlags = X86II::MO_GOTPCREL;
03030         WrapperKind = X86ISD::WrapperRIP;
03031         ExtraLoad = true;
03032       }
03033 
03034       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
03035                                           G->getOffset(), OpFlags);
03036 
03037       // Add a wrapper if needed.
03038       if (WrapperKind != ISD::DELETED_NODE)
03039         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
03040       // Add extra indirection if needed.
03041       if (ExtraLoad)
03042         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
03043                              MachinePointerInfo::getGOT(),
03044                              false, false, false, 0);
03045     }
03046   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
03047     unsigned char OpFlags = 0;
03048 
03049     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
03050     // external symbols should go through the PLT.
03051     if (Subtarget->isTargetELF() &&
03052         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
03053       OpFlags = X86II::MO_PLT;
03054     } else if (Subtarget->isPICStyleStubAny() &&
03055                (!Subtarget->getTargetTriple().isMacOSX() ||
03056                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03057       // PC-relative references to external symbols should go through $stub,
03058       // unless we're building with the leopard linker or later, which
03059       // automatically synthesizes these stubs.
03060       OpFlags = X86II::MO_DARWIN_STUB;
03061     }
03062 
03063     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
03064                                          OpFlags);
03065   } else if (Subtarget->isTarget64BitILP32() &&
03066              Callee->getValueType(0) == MVT::i32) {
03067     // Zero-extend the 32-bit Callee address to 64 bits according to the x32 ABI.
03068     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
03069   }
03070 
03071   // Returns a chain & a flag for retval copy to use.
03072   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
03073   SmallVector<SDValue, 8> Ops;
03074 
03075   if (!IsSibcall && isTailCall) {
03076     Chain = DAG.getCALLSEQ_END(Chain,
03077                                DAG.getIntPtrConstant(NumBytesToPop, true),
03078                                DAG.getIntPtrConstant(0, true), InFlag, dl);
03079     InFlag = Chain.getValue(1);
03080   }
03081 
03082   Ops.push_back(Chain);
03083   Ops.push_back(Callee);
03084 
03085   if (isTailCall)
03086     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
03087 
03088   // Add argument registers to the end of the list so that they are known live
03089   // into the call.
03090   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
03091     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
03092                                   RegsToPass[i].second.getValueType()));
03093 
03094   // Add a register mask operand representing the call-preserved registers.
03095   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
03096   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
03097   assert(Mask && "Missing call preserved mask for calling convention");
03098   Ops.push_back(DAG.getRegisterMask(Mask));
03099 
03100   if (InFlag.getNode())
03101     Ops.push_back(InFlag);
03102 
03103   if (isTailCall) {
03104     // We used to do:
03105     //// If this is the first return lowered for this function, add the regs
03106     //// to the liveout set for the function.
03107     // This isn't right, although it's probably harmless on x86; liveouts
03108     // should be computed from returns not tail calls.  Consider a void
03109     // function making a tail call to a function returning int.
03110     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
03111   }
03112 
03113   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
03114   InFlag = Chain.getValue(1);
03115 
03116   // Create the CALLSEQ_END node.
03117   unsigned NumBytesForCalleeToPop;
03118   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
03119                        DAG.getTarget().Options.GuaranteedTailCallOpt))
03120     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
03121   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
03122            !Subtarget->getTargetTriple().isOSMSVCRT() &&
03123            SR == StackStructReturn)
03124     // If this is a call to a struct-return function, the callee
03125     // pops the hidden struct pointer, so we have to push it back.
03126     // This is common for Darwin/X86, Linux & Mingw32 targets.
03127     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
03128     NumBytesForCalleeToPop = 4;
03129   else
03130     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
03131 
03132   // Returns a flag for retval copy to use.
03133   if (!IsSibcall) {
03134     Chain = DAG.getCALLSEQ_END(Chain,
03135                                DAG.getIntPtrConstant(NumBytesToPop, true),
03136                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
03137                                                      true),
03138                                InFlag, dl);
03139     InFlag = Chain.getValue(1);
03140   }
03141 
03142   // Handle result values, copying them out of physregs into vregs that we
03143   // return.
03144   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
03145                          Ins, dl, DAG, InVals);
03146 }
03147 
03148 //===----------------------------------------------------------------------===//
03149 //                Fast Calling Convention (tail call) implementation
03150 //===----------------------------------------------------------------------===//
03151 
03152 //  Like stdcall, the callee cleans up the arguments, except that ECX is
03153 //  reserved for storing the tail-called function's address. Only 2 registers
03154 //  are free for argument passing (inreg). Tail call optimization is performed
03155 //  provided:
03156 //                * tailcallopt is enabled
03157 //                * caller/callee are fastcc
03158 //  On X86_64 architecture with GOT-style position independent code only local
03159 //  (within module) calls are supported at the moment.
03160 //  To keep the stack aligned according to the platform ABI, the function
03161 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
03162 //  multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
03163 //  for example.) If a tail-called callee has more arguments than the caller,
03164 //  the caller needs to make sure that there is room to move the RETADDR to.
03165 //  This is achieved by reserving an area the size of the argument delta right
03166 //  after the original RETADDR, but before the saved frame pointer or the
03167 //  spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
03168 //  stack layout:
03169 //    arg1
03170 //    arg2
03171 //    RETADDR
03172 //    [ new RETADDR
03173 //      move area ]
03174 //    (possible EBP)
03175 //    ESI
03176 //    EDI
03177 //    local1 ..
03178 
03179 /// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 bytes,
03180 /// for a 16 byte alignment requirement.
03181 unsigned
03182 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
03183                                                SelectionDAG& DAG) const {
03184   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
03185   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
03186   unsigned StackAlignment = TFI.getStackAlignment();
03187   uint64_t AlignMask = StackAlignment - 1;
03188   int64_t Offset = StackSize;
03189   unsigned SlotSize = RegInfo->getSlotSize();
03190   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
03191     // Number smaller than 12 so just add the difference.
03192     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
03193   } else {
03194     // Mask out lower bits, add stackalignment once plus the 12 bytes.
03195     Offset = ((~AlignMask) & Offset) + StackAlignment +
03196       (StackAlignment-SlotSize);
03197   }
03198   return Offset;
03199 }
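// Worked examples of the rounding above (illustrative): with a 16 byte stack
// alignment and a 4 byte slot (x86-32), StackSize 20 gives (20 & 15) == 4 <= 12,
// so the result is 20 + (12 - 4) == 28 == 16*1 + 12. StackSize 30 gives
// (30 & 15) == 14 > 12, so the result is (30 & ~15) + 16 + 12 == 44 == 16*2 + 12.
// In both cases the argument area plus the 4 byte return address slot is a
// multiple of 16.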
03200 
03201 /// MatchingStackOffset - Return true if the given stack call argument is
03202 /// already available at the same (relative) position in the caller's
03203 /// incoming argument stack.
03204 static
03205 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
03206                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
03207                          const X86InstrInfo *TII) {
03208   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
03209   int FI = INT_MAX;
03210   if (Arg.getOpcode() == ISD::CopyFromReg) {
03211     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
03212     if (!TargetRegisterInfo::isVirtualRegister(VR))
03213       return false;
03214     MachineInstr *Def = MRI->getVRegDef(VR);
03215     if (!Def)
03216       return false;
03217     if (!Flags.isByVal()) {
03218       if (!TII->isLoadFromStackSlot(Def, FI))
03219         return false;
03220     } else {
03221       unsigned Opcode = Def->getOpcode();
03222       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
03223            Opcode == X86::LEA64_32r) &&
03224           Def->getOperand(1).isFI()) {
03225         FI = Def->getOperand(1).getIndex();
03226         Bytes = Flags.getByValSize();
03227       } else
03228         return false;
03229     }
03230   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
03231     if (Flags.isByVal())
03232       // ByVal argument is passed in as a pointer but it's now being
03233       // dereferenced. e.g.
03234       // define @foo(%struct.X* %A) {
03235       //   tail call @bar(%struct.X* byval %A)
03236       // }
03237       return false;
03238     SDValue Ptr = Ld->getBasePtr();
03239     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
03240     if (!FINode)
03241       return false;
03242     FI = FINode->getIndex();
03243   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
03244     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
03245     FI = FINode->getIndex();
03246     Bytes = Flags.getByValSize();
03247   } else
03248     return false;
03249 
03250   assert(FI != INT_MAX);
03251   if (!MFI->isFixedObjectIndex(FI))
03252     return false;
03253   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
03254 }
03255 
03256 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
03257 /// for tail call optimization. Targets which want to do tail call
03258 /// optimization should implement this function.
03259 bool
03260 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
03261                                                      CallingConv::ID CalleeCC,
03262                                                      bool isVarArg,
03263                                                      bool isCalleeStructRet,
03264                                                      bool isCallerStructRet,
03265                                                      Type *RetTy,
03266                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
03267                                     const SmallVectorImpl<SDValue> &OutVals,
03268                                     const SmallVectorImpl<ISD::InputArg> &Ins,
03269                                                      SelectionDAG &DAG) const {
03270   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
03271     return false;
03272 
03273   // If -tailcallopt is specified, make fastcc functions tail-callable.
03274   const MachineFunction &MF = DAG.getMachineFunction();
03275   const Function *CallerF = MF.getFunction();
03276 
03277   // If the function return type is x86_fp80 and the callee return type is not,
03278   // then the FP_EXTEND of the call result is not a nop. It's not safe to
03279   // perform a tailcall optimization here.
03280   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
03281     return false;
03282 
03283   CallingConv::ID CallerCC = CallerF->getCallingConv();
03284   bool CCMatch = CallerCC == CalleeCC;
03285   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
03286   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
03287 
03288   // Win64 functions have extra shadow space for argument homing. Don't do the
03289   // sibcall if the caller and callee have mismatched expectations for this
03290   // space.
03291   if (IsCalleeWin64 != IsCallerWin64)
03292     return false;
03293 
03294   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
03295     if (IsTailCallConvention(CalleeCC) && CCMatch)
03296       return true;
03297     return false;
03298   }
03299 
03300   // Look for obvious safe cases to perform tail call optimization that do not
03301   // require ABI changes. This is what gcc calls sibcall.
03302 
03303   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
03304   // emit a special epilogue.
03305   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
03306   if (RegInfo->needsStackRealignment(MF))
03307     return false;
03308 
03309   // Also avoid sibcall optimization if either caller or callee uses struct
03310   // return semantics.
03311   if (isCalleeStructRet || isCallerStructRet)
03312     return false;
03313 
03314   // An stdcall/thiscall caller is expected to clean up its arguments; the
03315   // callee isn't going to do that.
03316   // FIXME: this is more restrictive than needed. We could produce a tailcall
03317   // when the stack adjustment matches. For example, with a thiscall that takes
03318   // only one argument.
03319   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
03320                    CallerCC == CallingConv::X86_ThisCall))
03321     return false;
03322 
03323   // Do not sibcall optimize vararg calls unless all arguments are passed via
03324   // registers.
03325   if (isVarArg && !Outs.empty()) {
03326 
03327     // Optimizing for varargs on Win64 is unlikely to be safe without
03328     // additional testing.
03329     if (IsCalleeWin64 || IsCallerWin64)
03330       return false;
03331 
03332     SmallVector<CCValAssign, 16> ArgLocs;
03333     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03334                    *DAG.getContext());
03335 
03336     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03337     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
03338       if (!ArgLocs[i].isRegLoc())
03339         return false;
03340   }
03341 
03342   // If the call result is in ST0 / ST1, it needs to be popped off the x87
03343   // stack.  Therefore, if it's not used by the caller it is not safe to optimize
03344   // this into a sibcall.
03345   bool Unused = false;
03346   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
03347     if (!Ins[i].Used) {
03348       Unused = true;
03349       break;
03350     }
03351   }
03352   if (Unused) {
03353     SmallVector<CCValAssign, 16> RVLocs;
03354     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
03355                    *DAG.getContext());
03356     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
03357     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
03358       CCValAssign &VA = RVLocs[i];
03359       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
03360         return false;
03361     }
03362   }
03363 
03364   // If the calling conventions do not match, then we'd better make sure the
03365   // results are returned in the same way as what the caller expects.
03366   if (!CCMatch) {
03367     SmallVector<CCValAssign, 16> RVLocs1;
03368     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
03369                     *DAG.getContext());
03370     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
03371 
03372     SmallVector<CCValAssign, 16> RVLocs2;
03373     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
03374                     *DAG.getContext());
03375     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
03376 
03377     if (RVLocs1.size() != RVLocs2.size())
03378       return false;
03379     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
03380       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
03381         return false;
03382       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
03383         return false;
03384       if (RVLocs1[i].isRegLoc()) {
03385         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
03386           return false;
03387       } else {
03388         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
03389           return false;
03390       }
03391     }
03392   }
03393 
03394   // If the callee takes no arguments then go on to check the results of the
03395   // call.
03396   if (!Outs.empty()) {
03397     // Check if stack adjustment is needed. For now, do not do this if any
03398     // argument is passed on the stack.
03399     SmallVector<CCValAssign, 16> ArgLocs;
03400     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03401                    *DAG.getContext());
03402 
03403     // Allocate shadow area for Win64
03404     if (IsCalleeWin64)
03405       CCInfo.AllocateStack(32, 8);
03406 
03407     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03408     if (CCInfo.getNextStackOffset()) {
03409       MachineFunction &MF = DAG.getMachineFunction();
03410       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
03411         return false;
03412 
03413       // Check if the arguments are already laid out in the right way as
03414       // the caller's fixed stack objects.
03415       MachineFrameInfo *MFI = MF.getFrameInfo();
03416       const MachineRegisterInfo *MRI = &MF.getRegInfo();
03417       const X86InstrInfo *TII = Subtarget->getInstrInfo();
03418       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03419         CCValAssign &VA = ArgLocs[i];
03420         SDValue Arg = OutVals[i];
03421         ISD::ArgFlagsTy Flags = Outs[i].Flags;
03422         if (VA.getLocInfo() == CCValAssign::Indirect)
03423           return false;
03424         if (!VA.isRegLoc()) {
03425           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
03426                                    MFI, MRI, TII))
03427             return false;
03428         }
03429       }
03430     }
03431 
03432     // If the tailcall address may be in a register, then make sure it's
03433     // possible to register allocate for it. In 32-bit, the call address can
03434     // only target EAX, EDX, or ECX since the tail call must be scheduled after
03435     // callee-saved registers are restored. These happen to be the same
03436     // registers used to pass 'inreg' arguments so watch out for those.
03437     if (!Subtarget->is64Bit() &&
03438         ((!isa<GlobalAddressSDNode>(Callee) &&
03439           !isa<ExternalSymbolSDNode>(Callee)) ||
03440          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
03441       unsigned NumInRegs = 0;
03442       // In PIC we need an extra register to formulate the address computation
03443       // for the callee.
03444       unsigned MaxInRegs =
03445         (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
03446 
03447       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03448         CCValAssign &VA = ArgLocs[i];
03449         if (!VA.isRegLoc())
03450           continue;
03451         unsigned Reg = VA.getLocReg();
03452         switch (Reg) {
03453         default: break;
03454         case X86::EAX: case X86::EDX: case X86::ECX:
03455           if (++NumInRegs == MaxInRegs)
03456             return false;
03457           break;
03458         }
03459       }
03460     }
03461   }
03462 
03463   return true;
03464 }
03465 
03466 FastISel *
03467 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
03468                                   const TargetLibraryInfo *libInfo) const {
03469   return X86::createFastISel(funcInfo, libInfo);
03470 }
03471 
03472 //===----------------------------------------------------------------------===//
03473 //                           Other Lowering Hooks
03474 //===----------------------------------------------------------------------===//
03475 
03476 static bool MayFoldLoad(SDValue Op) {
03477   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
03478 }
03479 
03480 static bool MayFoldIntoStore(SDValue Op) {
03481   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
03482 }
03483 
03484 static bool isTargetShuffle(unsigned Opcode) {
03485   switch(Opcode) {
03486   default: return false;
03487   case X86ISD::BLENDI:
03488   case X86ISD::PSHUFB:
03489   case X86ISD::PSHUFD:
03490   case X86ISD::PSHUFHW:
03491   case X86ISD::PSHUFLW:
03492   case X86ISD::SHUFP:
03493   case X86ISD::PALIGNR:
03494   case X86ISD::MOVLHPS:
03495   case X86ISD::MOVLHPD:
03496   case X86ISD::MOVHLPS:
03497   case X86ISD::MOVLPS:
03498   case X86ISD::MOVLPD:
03499   case X86ISD::MOVSHDUP:
03500   case X86ISD::MOVSLDUP:
03501   case X86ISD::MOVDDUP:
03502   case X86ISD::MOVSS:
03503   case X86ISD::MOVSD:
03504   case X86ISD::UNPCKL:
03505   case X86ISD::UNPCKH:
03506   case X86ISD::VPERMILPI:
03507   case X86ISD::VPERM2X128:
03508   case X86ISD::VPERMI:
03509     return true;
03510   }
03511 }
03512 
03513 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03514                                     SDValue V1, unsigned TargetMask,
03515                                     SelectionDAG &DAG) {
03516   switch(Opc) {
03517   default: llvm_unreachable("Unknown x86 shuffle node");
03518   case X86ISD::PSHUFD:
03519   case X86ISD::PSHUFHW:
03520   case X86ISD::PSHUFLW:
03521   case X86ISD::VPERMILPI:
03522   case X86ISD::VPERMI:
03523     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
03524   }
03525 }
03526 
03527 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03528                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
03529   switch(Opc) {
03530   default: llvm_unreachable("Unknown x86 shuffle node");
03531   case X86ISD::MOVLHPS:
03532   case X86ISD::MOVLHPD:
03533   case X86ISD::MOVHLPS:
03534   case X86ISD::MOVLPS:
03535   case X86ISD::MOVLPD:
03536   case X86ISD::MOVSS:
03537   case X86ISD::MOVSD:
03538   case X86ISD::UNPCKL:
03539   case X86ISD::UNPCKH:
03540     return DAG.getNode(Opc, dl, VT, V1, V2);
03541   }
03542 }
03543 
03544 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
03545   MachineFunction &MF = DAG.getMachineFunction();
03546   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
03547   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
03548   int ReturnAddrIndex = FuncInfo->getRAIndex();
03549 
03550   if (ReturnAddrIndex == 0) {
03551     // Set up a frame object for the return address.
03552     unsigned SlotSize = RegInfo->getSlotSize();
03553     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
03554                                                            -(int64_t)SlotSize,
03555                                                            false);
03556     FuncInfo->setRAIndex(ReturnAddrIndex);
03557   }
03558 
03559   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
03560 }
03561 
03562 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
03563                                        bool hasSymbolicDisplacement) {
03564   // Offset should fit into 32 bit immediate field.
03565   if (!isInt<32>(Offset))
03566     return false;
03567 
03568   // If we don't have a symbolic displacement - we don't have any extra
03569   // restrictions.
03570   if (!hasSymbolicDisplacement)
03571     return true;
03572 
03573   // FIXME: Some tweaks might be needed for medium code model.
03574   if (M != CodeModel::Small && M != CodeModel::Kernel)
03575     return false;
03576 
03577   // For the small code model, assume the last object ends at least 16MB below
03578   // the 31-bit boundary. We may also accept pretty large negative constants,
03579   // knowing that all objects lie in the positive half of the address space.
03580   if (M == CodeModel::Small && Offset < 16*1024*1024)
03581     return true;
03582 
03583   // For the kernel code model we know that all objects reside in the negative
03584   // half of the 32-bit address space. We must not accept negative offsets, since
03585   // they could fall out of range, but we may accept pretty large positive ones.
03586   if (M == CodeModel::Kernel && Offset >= 0)
03587     return true;
03588 
03589   return false;
03590 }
03591 
03592 /// isCalleePop - Determines whether the callee is required to pop its
03593 /// own arguments. Callee pop is necessary to support tail calls.
03594 bool X86::isCalleePop(CallingConv::ID CallingConv,
03595                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
03596   switch (CallingConv) {
03597   default:
03598     return false;
03599   case CallingConv::X86_StdCall:
03600   case CallingConv::X86_FastCall:
03601   case CallingConv::X86_ThisCall:
03602     return !is64Bit;
03603   case CallingConv::Fast:
03604   case CallingConv::GHC:
03605   case CallingConv::HiPE:
03606     if (IsVarArg)
03607       return false;
03608     return TailCallOpt;
03609   }
03610 }
03611 
03612 /// \brief Return true if the condition is an unsigned comparison operation.
03613 static bool isX86CCUnsigned(unsigned X86CC) {
03614   switch (X86CC) {
03615   default: llvm_unreachable("Invalid integer condition!");
03616   case X86::COND_E:     return true;
03617   case X86::COND_G:     return false;
03618   case X86::COND_GE:    return false;
03619   case X86::COND_L:     return false;
03620   case X86::COND_LE:    return false;
03621   case X86::COND_NE:    return true;
03622   case X86::COND_B:     return true;
03623   case X86::COND_A:     return true;
03624   case X86::COND_BE:    return true;
03625   case X86::COND_AE:    return true;
03626   }
03627   llvm_unreachable("covered switch fell through?!");
03628 }
03629 
03630 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86
03631 /// specific condition code, returning the condition code and the LHS/RHS of the
03632 /// comparison to make.
03633 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
03634                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
03635   if (!isFP) {
03636     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
03637       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
03638         // X > -1   -> X == 0, jump !sign.
03639         RHS = DAG.getConstant(0, RHS.getValueType());
03640         return X86::COND_NS;
03641       }
03642       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
03643         // X < 0   -> X == 0, jump on sign.
03644         return X86::COND_S;
03645       }
03646       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
03647         // X < 1   -> X <= 0
03648         RHS = DAG.getConstant(0, RHS.getValueType());
03649         return X86::COND_LE;
03650       }
03651     }
03652 
03653     switch (SetCCOpcode) {
03654     default: llvm_unreachable("Invalid integer condition!");
03655     case ISD::SETEQ:  return X86::COND_E;
03656     case ISD::SETGT:  return X86::COND_G;
03657     case ISD::SETGE:  return X86::COND_GE;
03658     case ISD::SETLT:  return X86::COND_L;
03659     case ISD::SETLE:  return X86::COND_LE;
03660     case ISD::SETNE:  return X86::COND_NE;
03661     case ISD::SETULT: return X86::COND_B;
03662     case ISD::SETUGT: return X86::COND_A;
03663     case ISD::SETULE: return X86::COND_BE;
03664     case ISD::SETUGE: return X86::COND_AE;
03665     }
03666   }
03667 
03668   // First determine if it is required or is profitable to flip the operands.
03669 
03670   // If LHS is a foldable load, but RHS is not, flip the condition.
03671   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
03672       !ISD::isNON_EXTLoad(RHS.getNode())) {
03673     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
03674     std::swap(LHS, RHS);
03675   }
03676 
03677   switch (SetCCOpcode) {
03678   default: break;
03679   case ISD::SETOLT:
03680   case ISD::SETOLE:
03681   case ISD::SETUGT:
03682   case ISD::SETUGE:
03683     std::swap(LHS, RHS);
03684     break;
03685   }
03686 
03687   // On a floating point condition, the flags are set as follows:
03688   // ZF  PF  CF   op
03689   //  0 | 0 | 0 | X > Y
03690   //  0 | 0 | 1 | X < Y
03691   //  1 | 0 | 0 | X == Y
03692   //  1 | 1 | 1 | unordered
03693   switch (SetCCOpcode) {
03694   default: llvm_unreachable("Condcode should be pre-legalized away");
03695   case ISD::SETUEQ:
03696   case ISD::SETEQ:   return X86::COND_E;
03697   case ISD::SETOLT:              // flipped
03698   case ISD::SETOGT:
03699   case ISD::SETGT:   return X86::COND_A;
03700   case ISD::SETOLE:              // flipped
03701   case ISD::SETOGE:
03702   case ISD::SETGE:   return X86::COND_AE;
03703   case ISD::SETUGT:              // flipped
03704   case ISD::SETULT:
03705   case ISD::SETLT:   return X86::COND_B;
03706   case ISD::SETUGE:              // flipped
03707   case ISD::SETULE:
03708   case ISD::SETLE:   return X86::COND_BE;
03709   case ISD::SETONE:
03710   case ISD::SETNE:   return X86::COND_NE;
03711   case ISD::SETUO:   return X86::COND_P;
03712   case ISD::SETO:    return X86::COND_NP;
03713   case ISD::SETOEQ:
03714   case ISD::SETUNE:  return X86::COND_INVALID;
03715   }
03716 }
03717 
03718 /// hasFPCMov - Is there a floating-point cmov for the specified X86 condition
03719 /// code? The current x86 ISA includes the following FP cmov instructions:
03720 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
03721 static bool hasFPCMov(unsigned X86CC) {
03722   switch (X86CC) {
03723   default:
03724     return false;
03725   case X86::COND_B:
03726   case X86::COND_BE:
03727   case X86::COND_E:
03728   case X86::COND_P:
03729   case X86::COND_A:
03730   case X86::COND_AE:
03731   case X86::COND_NE:
03732   case X86::COND_NP:
03733     return true;
03734   }
03735 }
03736 
03737 /// isFPImmLegal - Returns true if the target can instruction select the
03738 /// specified FP immediate natively. If false, the legalizer will
03739 /// materialize the FP immediate as a load from a constant pool.
03740 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
03741   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
03742     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
03743       return true;
03744   }
03745   return false;
03746 }
03747 
03748 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
03749                                               ISD::LoadExtType ExtTy,
03750                                               EVT NewVT) const {
03751   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
03752   // relocations target a movq or addq instruction, so don't let the load shrink.
03753   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
03754   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
03755     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
03756       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
03757   return true;
03758 }
03759 
03760 /// \brief Returns true if it is beneficial to convert a load of a constant
03761 /// to just the constant itself.
03762 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
03763                                                           Type *Ty) const {
03764   assert(Ty->isIntegerTy());
03765 
03766   unsigned BitSize = Ty->getPrimitiveSizeInBits();
03767   if (BitSize == 0 || BitSize > 64)
03768     return false;
03769   return true;
03770 }
03771 
03772 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
03773                                                 unsigned Index) const {
03774   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
03775     return false;
03776 
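        // Only extracts that start at element 0 (the low subvector) or exactly
        // one result-width above it (e.g. the high half of a vector twice as
        // wide) are treated as cheap subvector extractions here.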
03777   return (Index == 0 || Index == ResVT.getVectorNumElements());
03778 }
03779 
03780 bool X86TargetLowering::isCheapToSpeculateCttz() const {
03781   // Speculate cttz only if we can directly use TZCNT.
03782   return Subtarget->hasBMI();
03783 }
03784 
03785 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
03786   // Speculate ctlz only if we can directly use LZCNT.
03787   return Subtarget->hasLZCNT();
03788 }
03789 
03790 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
03791 /// the specified range [Low, Hi).
03792 static bool isUndefOrInRange(int Val, int Low, int Hi) {
03793   return (Val < 0) || (Val >= Low && Val < Hi);
03794 }
03795 
03796 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
03797 /// specified value.
03798 static bool isUndefOrEqual(int Val, int CmpVal) {
03799   return (Val < 0 || Val == CmpVal);
03800 }
03801 
03802 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
03803 /// at position Pos and ending at Pos+Size, is either undef or falls within the
03804 /// sequential range [Low, Low+Size).
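      /// For example, Mask = <4, 5, -1, 7> with Pos = 0, Size = 4 and Low = 4
      /// returns true: the undef (-1) element is skipped and the remaining
      /// elements match the expected sequential values.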
03805 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
03806                                        unsigned Pos, unsigned Size, int Low) {
03807   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
03808     if (!isUndefOrEqual(Mask[i], Low))
03809       return false;
03810   return true;
03811 }
03812 
03813 /// isVEXTRACTIndex - Return true if the specified
03814 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
03815 /// suitable for instructions that extract 128- or 256-bit vectors.
03816 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
03817   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
03818   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
03819     return false;
03820 
03821   // The index should be aligned on a vecWidth-bit boundary.
03822   uint64_t Index =
03823     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
03824 
03825   MVT VT = N->getSimpleValueType(0);
03826   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
03827   bool Result = (Index * ElSize) % vecWidth == 0;
03828 
03829   return Result;
03830 }
03831 
03832 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
03833 /// operand specifies a subvector insert that is suitable for input to
03834 /// instructions that insert 128- or 256-bit subvectors.
03835 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
03836   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
03837   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
03838     return false;
03839   // The index should be aligned on a vecWidth-bit boundary.
03840   uint64_t Index =
03841     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
03842 
03843   MVT VT = N->getSimpleValueType(0);
03844   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
03845   bool Result = (Index * ElSize) % vecWidth == 0;
03846 
03847   return Result;
03848 }
03849 
03850 bool X86::isVINSERT128Index(SDNode *N) {
03851   return isVINSERTIndex(N, 128);
03852 }
03853 
03854 bool X86::isVINSERT256Index(SDNode *N) {
03855   return isVINSERTIndex(N, 256);
03856 }
03857 
03858 bool X86::isVEXTRACT128Index(SDNode *N) {
03859   return isVEXTRACTIndex(N, 128);
03860 }
03861 
03862 bool X86::isVEXTRACT256Index(SDNode *N) {
03863   return isVEXTRACTIndex(N, 256);
03864 }
03865 
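      /// Compute the chunk-index immediate for the VEXTRACT* instructions from
      /// an EXTRACT_SUBVECTOR node. For illustration: extracting at element
      /// index 4 from a v8i32 with vecWidth == 128 gives NumElemsPerChunk == 4
      /// and an immediate of 1, i.e. the upper 128-bit lane.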
03866 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
03867   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
03868   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
03869     llvm_unreachable("Illegal extract subvector for VEXTRACT");
03870 
03871   uint64_t Index =
03872     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
03873 
03874   MVT VecVT = N->getOperand(0).getSimpleValueType();
03875   MVT ElVT = VecVT.getVectorElementType();
03876 
03877   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
03878   return Index / NumElemsPerChunk;
03879 }
03880 
03881 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
03882   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
03883   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
03884     llvm_unreachable("Illegal insert subvector for VINSERT");
03885 
03886   uint64_t Index =
03887     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
03888 
03889   MVT VecVT = N->getSimpleValueType(0);
03890   MVT ElVT = VecVT.getVectorElementType();
03891 
03892   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
03893   return Index / NumElemsPerChunk;
03894 }
03895 
03896 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
03897 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
03898 /// and VEXTRACTI128 instructions.
03899 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
03900   return getExtractVEXTRACTImmediate(N, 128);
03901 }
03902 
03903 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
03904 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
03905 /// and VEXTRACTI64x4 instructions.
03906 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
03907   return getExtractVEXTRACTImmediate(N, 256);
03908 }
03909 
03910 /// getInsertVINSERT128Immediate - Return the appropriate immediate
03911 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
03912 /// and VINSERTI128 instructions.
03913 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
03914   return getInsertVINSERTImmediate(N, 128);
03915 }
03916 
03917 /// getInsertVINSERT256Immediate - Return the appropriate immediate
03918 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
03919 /// and VINSERTI64x4 instructions.
03920 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
03921   return getInsertVINSERTImmediate(N, 256);
03922 }
03923 
03924 /// isZero - Returns true if V is a constant integer zero
03925 static bool isZero(SDValue V) {
03926   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
03927   return C && C->isNullValue();
03928 }
03929 
03930 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
03931 /// constant +0.0.
03932 bool X86::isZeroNode(SDValue Elt) {
03933   if (isZero(Elt))
03934     return true;
03935   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
03936     return CFP->getValueAPF().isPosZero();
03937   return false;
03938 }
03939 
03940 /// getZeroVector - Returns a vector of specified type with all zero elements.
03941 ///
03942 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
03943                              SelectionDAG &DAG, SDLoc dl) {
03944   assert(VT.isVector() && "Expected a vector type");
03945 
03946   // Always build SSE zero vectors as <4 x i32> bitcasted
03947   // to their dest type. This ensures they get CSE'd.
03948   SDValue Vec;
03949   if (VT.is128BitVector()) {  // SSE
03950     if (Subtarget->hasSSE2()) {  // SSE2
03951       SDValue Cst = DAG.getConstant(0, MVT::i32);
03952       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
03953     } else { // SSE1
03954       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
03955       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
03956     }
03957   } else if (VT.is256BitVector()) { // AVX
03958     if (Subtarget->hasInt256()) { // AVX2
03959       SDValue Cst = DAG.getConstant(0, MVT::i32);
03960       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
03961       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
03962     } else {
03963       // 256-bit logic and arithmetic instructions in AVX are all
03964       // floating-point, no support for integer ops. Emit fp zeroed vectors.
03965       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
03966       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
03967       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
03968     }
03969   } else if (VT.is512BitVector()) { // AVX-512
03970       SDValue Cst = DAG.getConstant(0, MVT::i32);
03971       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
03972                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
03973       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
03974   } else if (VT.getScalarType() == MVT::i1) {
03975 
03976     assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16)
03977             && "Unexpected vector type");
03978     assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8)
03979             && "Unexpected vector type");
03980     SDValue Cst = DAG.getConstant(0, MVT::i1);
03981     SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst);
03982     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
03983   } else
03984     llvm_unreachable("Unexpected vector type");
03985 
03986   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
03987 }
03988 
03989 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
03990                                 SelectionDAG &DAG, SDLoc dl,
03991                                 unsigned vectorWidth) {
03992   assert((vectorWidth == 128 || vectorWidth == 256) &&
03993          "Unsupported vector width");
03994   EVT VT = Vec.getValueType();
03995   EVT ElVT = VT.getVectorElementType();
03996   unsigned Factor = VT.getSizeInBits()/vectorWidth;
03997   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
03998                                   VT.getVectorNumElements()/Factor);
03999 
04000   // Extract from UNDEF is UNDEF.
04001   if (Vec.getOpcode() == ISD::UNDEF)
04002     return DAG.getUNDEF(ResultVT);
04003 
04004   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
04005   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
04006 
04007   // This is the index of the first element of the vectorWidth-bit chunk
04008   // we want.
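        // For illustration: with a v8i32 input and vectorWidth == 128, an
        // IdxVal of 5 gives ElemsPerChunk == 4 and NormalizedIdxVal == 4, so
        // the extract is aligned down to the start of the containing chunk.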
04009   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
04010                                * ElemsPerChunk);
04011 
04012   // If the input is a buildvector just emit a smaller one.
04013   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
04014     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
04015                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
04016                                     ElemsPerChunk));
04017 
04018   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
04019   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
04020 }
04021 
04022 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
04023 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
04024 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
04025 /// instructions or a simple subregister reference. Idx is an index in the
04026 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
04027 /// lowering EXTRACT_VECTOR_ELT operations easier.
04028 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
04029                                    SelectionDAG &DAG, SDLoc dl) {
04030   assert((Vec.getValueType().is256BitVector() ||
04031           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
04032   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
04033 }
04034 
04035 /// Generate a DAG to grab 256-bits from a 512-bit vector.
04036 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
04037                                    SelectionDAG &DAG, SDLoc dl) {
04038   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
04039   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
04040 }
04041 
04042 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
04043                                unsigned IdxVal, SelectionDAG &DAG,
04044                                SDLoc dl, unsigned vectorWidth) {
04045   assert((vectorWidth == 128 || vectorWidth == 256) &&
04046          "Unsupported vector width");
04047   // Inserting an UNDEF subvector leaves Result unchanged.
04048   if (Vec.getOpcode() == ISD::UNDEF)
04049     return Result;
04050   EVT VT = Vec.getValueType();
04051   EVT ElVT = VT.getVectorElementType();
04052   EVT ResultVT = Result.getValueType();
04053 
04054   // Insert the relevant vectorWidth bits.
04055   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
04056 
04057   // This is the index of the first element of the vectorWidth-bit chunk
04058   // we want.
04059   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
04060                                * ElemsPerChunk);
04061 
04062   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
04063   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
04064 }
04065 
04066 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
04067 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
04068 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
04069 /// simple superregister reference.  Idx is an index in the 128 bits
04070 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
04071 /// lowering INSERT_VECTOR_ELT operations easier.
04072 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
04073                                   SelectionDAG &DAG, SDLoc dl) {
04074   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
04075 
04076   // For insertion into the zero index (low half) of a 256-bit vector, it is
04077   // more efficient to generate a blend with immediate instead of an insert*128.
04078   // We are still creating an INSERT_SUBVECTOR below with an undef node to
04079   // extend the subvector to the size of the result vector. Make sure that
04080   // we are not recursing on that node by checking for undef here.
04081   if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
04082       Result.getOpcode() != ISD::UNDEF) {
04083     EVT ResultVT = Result.getValueType();
04084     SDValue ZeroIndex = DAG.getIntPtrConstant(0);
04085     SDValue Undef = DAG.getUNDEF(ResultVT);
04086     SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
04087                                  Vec, ZeroIndex);
04088 
04089     // The blend instruction, and therefore its mask, depend on the data type.
04090     MVT ScalarType = ResultVT.getScalarType().getSimpleVT();
04091     if (ScalarType.isFloatingPoint()) {
04092       // Choose either vblendps (float) or vblendpd (double).
04093       unsigned ScalarSize = ScalarType.getSizeInBits();
04094       assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
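            // A set bit in the BLENDI immediate selects the corresponding
            // element from the second operand (Vec256 here), so 0x03 takes the
            // low two f64 elements and 0x0f the low four f32 elements from it.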
04095       unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
04096       SDValue Mask = DAG.getConstant(MaskVal, MVT::i8);
04097       return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
04098     }
04099 
04100     const X86Subtarget &Subtarget =
04101     static_cast<const X86Subtarget &>(DAG.getSubtarget());
04102 
04103     // AVX2 is needed for 256-bit integer blend support.
04104     // Integers must be cast to 32-bit because there is only vpblendd;
04105     // vpblendw can't be used for this because it has a handicapped mask.
04106 
04107     // If we don't have AVX2, then cast to float. Using a wrong domain blend
04108     // is still more efficient than using the wrong domain vinsertf128 that
04109     // will be created by InsertSubVector().
04110     MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
04111 
04112     SDValue Mask = DAG.getConstant(0x0f, MVT::i8);
04113     Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256);
04114     Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
04115     return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256);
04116   }
04117 
04118   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
04119 }
04120 
04121 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
04122                                   SelectionDAG &DAG, SDLoc dl) {
04123   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
04124   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
04125 }
04126 
04127 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
04128 /// instructions. This is used because creating CONCAT_VECTOR nodes of
04129 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
04130 /// large BUILD_VECTORS.
04131 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
04132                                    unsigned NumElems, SelectionDAG &DAG,
04133                                    SDLoc dl) {
04134   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
04135   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
04136 }
04137 
04138 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
04139                                    unsigned NumElems, SelectionDAG &DAG,
04140                                    SDLoc dl) {
04141   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
04142   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
04143 }
04144 
04145 /// getOnesVector - Returns a vector of specified type with all bits set.
04146 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
04147 /// no AVX2 support, use two <4 x i32> inserted into an <8 x i32> appropriately.
04148 /// Then bitcast to their original type, ensuring they get CSE'd.
04149 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
04150                              SDLoc dl) {
04151   assert(VT.isVector() && "Expected a vector type");
04152 
04153   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
04154   SDValue Vec;
04155   if (VT.is256BitVector()) {
04156     if (HasInt256) { // AVX2
04157       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04158       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
04159     } else { // AVX
04160       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04161       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
04162     }
04163   } else if (VT.is128BitVector()) {
04164     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04165   } else
04166     llvm_unreachable("Unexpected vector type");
04167 
04168   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
04169 }
04170 
04171 /// getMOVL - Returns a vector_shuffle node for a movs{s|d} / movd
04172 /// operation of the specified width.
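      /// For example, with NumElems == 4 the mask is <4,1,2,3>: element 0 is
      /// taken from V2 and the remaining elements come from V1.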
04173 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
04174                        SDValue V2) {
04175   unsigned NumElems = VT.getVectorNumElements();
04176   SmallVector<int, 8> Mask;
04177   Mask.push_back(NumElems);
04178   for (unsigned i = 1; i != NumElems; ++i)
04179     Mask.push_back(i);
04180   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04181 }
04182 
04183 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
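      /// For example, for v4i32 the mask is <0,4,1,5>, interleaving the low
      /// halves of V1 and V2.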
04184 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
04185                           SDValue V2) {
04186   unsigned NumElems = VT.getVectorNumElements();
04187   SmallVector<int, 8> Mask;
04188   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
04189     Mask.push_back(i);
04190     Mask.push_back(i + NumElems);
04191   }
04192   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04193 }
04194 
04195 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
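      /// For example, for v4i32 the mask is <2,6,3,7>, interleaving the high
      /// halves of V1 and V2.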
04196 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
04197                           SDValue V2) {
04198   unsigned NumElems = VT.getVectorNumElements();
04199   SmallVector<int, 8> Mask;
04200   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
04201     Mask.push_back(i + Half);
04202     Mask.push_back(i + NumElems + Half);
04203   }
04204   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04205 }
04206 
04207 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
04208 /// vector and a zero or undef vector.  This produces a shuffle where the low
04209 /// element of V2 is swizzled into the zero/undef vector, landing at element
04210 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
04211 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
04212                                            bool IsZero,
04213                                            const X86Subtarget *Subtarget,
04214                                            SelectionDAG &DAG) {
04215   MVT VT = V2.getSimpleValueType();
04216   SDValue V1 = IsZero
04217     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
04218   unsigned NumElems = VT.getVectorNumElements();
04219   SmallVector<int, 16> MaskVec;
04220   for (unsigned i = 0; i != NumElems; ++i)
04221     // If this is the insertion idx, put the low elt of V2 here.
04222     MaskVec.push_back(i == Idx ? NumElems : i);
04223   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
04224 }
04225 
04226 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
04227 /// target-specific opcode. Returns true if the Mask could be calculated. Sets
04228 /// IsUnary to true if the shuffle uses only one source. Note that this will set
04229 /// IsUnary for shuffles which use a single input multiple times, and in those
04230 /// cases it will adjust the mask to only have indices within that single input.
04231 static bool getTargetShuffleMask(SDNode *N, MVT VT,
04232                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
04233   unsigned NumElems = VT.getVectorNumElements();
04234   SDValue ImmN;
04235 
04236   IsUnary = false;
04237   bool IsFakeUnary = false;
04238   switch(N->getOpcode()) {
04239   case X86ISD::BLENDI:
04240     ImmN = N->getOperand(N->getNumOperands()-1);
04241     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04242     break;
04243   case X86ISD::SHUFP:
04244     ImmN = N->getOperand(N->getNumOperands()-1);
04245     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04246     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04247     break;
04248   case X86ISD::UNPCKH:
04249     DecodeUNPCKHMask(VT, Mask);
04250     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04251     break;
04252   case X86ISD::UNPCKL:
04253     DecodeUNPCKLMask(VT, Mask);
04254     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04255     break;
04256   case X86ISD::MOVHLPS:
04257     DecodeMOVHLPSMask(NumElems, Mask);
04258     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04259     break;
04260   case X86ISD::MOVLHPS:
04261     DecodeMOVLHPSMask(NumElems, Mask);
04262     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04263     break;
04264   case X86ISD::PALIGNR:
04265     ImmN = N->getOperand(N->getNumOperands()-1);
04266     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04267     break;
04268   case X86ISD::PSHUFD:
04269   case X86ISD::VPERMILPI:
04270     ImmN = N->getOperand(N->getNumOperands()-1);
04271     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04272     IsUnary = true;
04273     break;
04274   case X86ISD::PSHUFHW:
04275     ImmN = N->getOperand(N->getNumOperands()-1);
04276     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04277     IsUnary = true;
04278     break;
04279   case X86ISD::PSHUFLW:
04280     ImmN = N->getOperand(N->getNumOperands()-1);
04281     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04282     IsUnary = true;
04283     break;
04284   case X86ISD::PSHUFB: {
04285     IsUnary = true;
04286     SDValue MaskNode = N->getOperand(1);
04287     while (MaskNode->getOpcode() == ISD::BITCAST)
04288       MaskNode = MaskNode->getOperand(0);
04289 
04290     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
04291       // If we have a build-vector, then things are easy.
04292       EVT VT = MaskNode.getValueType();
04293       assert(VT.isVector() &&
04294              "Can't produce a non-vector with a build_vector!");
04295       if (!VT.isInteger())
04296         return false;
04297 
04298       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
04299 
04300       SmallVector<uint64_t, 32> RawMask;
04301       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
04302         SDValue Op = MaskNode->getOperand(i);
04303         if (Op->getOpcode() == ISD::UNDEF) {
04304           RawMask.push_back((uint64_t)SM_SentinelUndef);
04305           continue;
04306         }
04307         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
04308         if (!CN)
04309           return false;
04310         APInt MaskElement = CN->getAPIntValue();
04311 
04312         // We now have to decode the element which could be any integer size and
04313         // extract each byte of it.
04314         for (int j = 0; j < NumBytesPerElement; ++j) {
04315           // Note that this is x86 and so always little endian: the low byte is
04316           // the first byte of the mask.
04317           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
04318           MaskElement = MaskElement.lshr(8);
04319         }
04320       }
04321       DecodePSHUFBMask(RawMask, Mask);
04322       break;
04323     }
04324 
04325     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
04326     if (!MaskLoad)
04327       return false;
04328 
04329     SDValue Ptr = MaskLoad->getBasePtr();
04330     if (Ptr->getOpcode() == X86ISD::Wrapper)
04331       Ptr = Ptr->getOperand(0);
04332 
04333     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
04334     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
04335       return false;
04336 
04337     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
04338       DecodePSHUFBMask(C, Mask);
04339       if (Mask.empty())
04340         return false;
04341       break;
04342     }
04343 
04344     return false;
04345   }
04346   case X86ISD::VPERMI:
04347     ImmN = N->getOperand(N->getNumOperands()-1);
04348     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04349     IsUnary = true;
04350     break;
04351   case X86ISD::MOVSS:
04352   case X86ISD::MOVSD:
04353     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
04354     break;
04355   case X86ISD::VPERM2X128:
04356     ImmN = N->getOperand(N->getNumOperands()-1);
04357     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04358     if (Mask.empty()) return false;
04359     break;
04360   case X86ISD::MOVSLDUP:
04361     DecodeMOVSLDUPMask(VT, Mask);
04362     IsUnary = true;
04363     break;
04364   case X86ISD::MOVSHDUP:
04365     DecodeMOVSHDUPMask(VT, Mask);
04366     IsUnary = true;
04367     break;
04368   case X86ISD::MOVDDUP:
04369     DecodeMOVDDUPMask(VT, Mask);
04370     IsUnary = true;
04371     break;
04372   case X86ISD::MOVLHPD:
04373   case X86ISD::MOVLPD:
04374   case X86ISD::MOVLPS:
04375     // Not yet implemented
04376     return false;
04377   default: llvm_unreachable("unknown target shuffle node");
04378   }
04379 
04380   // If we have a fake unary shuffle, the shuffle mask is spread across two
04381   // inputs that are actually the same node. Re-map the mask to always point
04382   // into the first input.
04383   if (IsFakeUnary)
04384     for (int &M : Mask)
04385       if (M >= (int)Mask.size())
04386         M -= Mask.size();
04387 
04388   return true;
04389 }
04390 
04391 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
04392 /// element of the result of the vector shuffle.
04393 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
04394                                    unsigned Depth) {
04395   if (Depth == 6)
04396     return SDValue();  // Limit search depth.
04397 
04398   SDValue V = SDValue(N, 0);
04399   EVT VT = V.getValueType();
04400   unsigned Opcode = V.getOpcode();
04401 
04402   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
04403   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
04404     int Elt = SV->getMaskElt(Index);
04405 
04406     if (Elt < 0)
04407       return DAG.getUNDEF(VT.getVectorElementType());
04408 
04409     unsigned NumElems = VT.getVectorNumElements();
04410     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
04411                                          : SV->getOperand(1);
04412     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
04413   }
04414 
04415   // Recurse into target specific vector shuffles to find scalars.
04416   if (isTargetShuffle(Opcode)) {
04417     MVT ShufVT = V.getSimpleValueType();
04418     unsigned NumElems = ShufVT.getVectorNumElements();
04419     SmallVector<int, 16> ShuffleMask;
04420     bool IsUnary;
04421 
04422     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
04423       return SDValue();
04424 
04425     int Elt = ShuffleMask[Index];
04426     if (Elt < 0)
04427       return DAG.getUNDEF(ShufVT.getVectorElementType());
04428 
04429     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
04430                                          : N->getOperand(1);
04431     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
04432                                Depth+1);
04433   }
04434 
04435   // Actual nodes that may contain scalar elements
04436   if (Opcode == ISD::BITCAST) {
04437     V = V.getOperand(0);
04438     EVT SrcVT = V.getValueType();
04439     unsigned NumElems = VT.getVectorNumElements();
04440 
04441     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
04442       return SDValue();
04443   }
04444 
04445   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
04446     return (Index == 0) ? V.getOperand(0)
04447                         : DAG.getUNDEF(VT.getVectorElementType());
04448 
04449   if (V.getOpcode() == ISD::BUILD_VECTOR)
04450     return V.getOperand(Index);
04451 
04452   return SDValue();
04453 }
04454 
04455 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
04456 ///
04457 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
04458                                        unsigned NumNonZero, unsigned NumZero,
04459                                        SelectionDAG &DAG,
04460                                        const X86Subtarget* Subtarget,
04461                                        const TargetLowering &TLI) {
04462   if (NumNonZero > 8)
04463     return SDValue();
04464 
04465   SDLoc dl(Op);
04466   SDValue V;
04467   bool First = true;
04468 
04469   // SSE4.1 - use PINSRB to insert each byte directly.
04470   if (Subtarget->hasSSE41()) {
04471     for (unsigned i = 0; i < 16; ++i) {
04472       bool isNonZero = (NonZeros & (1 << i)) != 0;
04473       if (isNonZero) {
04474         if (First) {
04475           if (NumZero)
04476             V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
04477           else
04478             V = DAG.getUNDEF(MVT::v16i8);
04479           First = false;
04480         }
04481         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
04482                         MVT::v16i8, V, Op.getOperand(i),
04483                         DAG.getIntPtrConstant(i));
04484       }
04485     }
04486 
04487     return V;
04488   }
04489 
04490   // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
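        // Each adjacent byte pair is zero-extended to i16, combined as
        // (hi << 8) | lo, and inserted into a v8i16 at position i/2; the
        // result is bitcast back to v16i8 at the end.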
04491   for (unsigned i = 0; i < 16; ++i) {
04492     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
04493     if (ThisIsNonZero && First) {
04494       if (NumZero)
04495         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
04496       else
04497         V = DAG.getUNDEF(MVT::v8i16);
04498       First = false;
04499     }
04500 
04501     if ((i & 1) != 0) {
04502       SDValue ThisElt, LastElt;
04503       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
04504       if (LastIsNonZero) {
04505         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
04506                               MVT::i16, Op.getOperand(i-1));
04507       }
04508       if (ThisIsNonZero) {
04509         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
04510         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
04511                               ThisElt, DAG.getConstant(8, MVT::i8));
04512         if (LastIsNonZero)
04513           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
04514       } else
04515         ThisElt = LastElt;
04516 
04517       if (ThisElt.getNode())
04518         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
04519                         DAG.getIntPtrConstant(i/2));
04520     }
04521   }
04522 
04523   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
04524 }
04525 
04526 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
04527 ///
04528 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
04529                                      unsigned NumNonZero, unsigned NumZero,
04530                                      SelectionDAG &DAG,
04531                                      const X86Subtarget* Subtarget,
04532                                      const TargetLowering &TLI) {
04533   if (NumNonZero > 4)
04534     return SDValue();
04535 
04536   SDLoc dl(Op);
04537   SDValue V;
04538   bool First = true;
04539   for (unsigned i = 0; i < 8; ++i) {
04540     bool isNonZero = (NonZeros & (1 << i)) != 0;
04541     if (isNonZero) {
04542       if (First) {
04543         if (NumZero)
04544           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
04545         else
04546           V = DAG.getUNDEF(MVT::v8i16);
04547         First = false;
04548       }
04549       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
04550                       MVT::v8i16, V, Op.getOperand(i),
04551                       DAG.getIntPtrConstant(i));
04552     }
04553   }
04554 
04555   return V;
04556 }
04557 
04558 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
04559 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
04560                                      const X86Subtarget *Subtarget,
04561                                      const TargetLowering &TLI) {
04562   // Find all zeroable elements.
04563   std::bitset<4> Zeroable;
04564   for (int i=0; i < 4; ++i) {
04565     SDValue Elt = Op->getOperand(i);
04566     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
04567   }
04568   assert(Zeroable.size() - Zeroable.count() > 1 &&
04569          "We expect at least two non-zero elements!");
04570 
04571   // We only know how to deal with build_vector nodes where elements are either
04572   // zeroable or extract_vector_elt with constant index.
04573   SDValue FirstNonZero;
04574   unsigned FirstNonZeroIdx;
04575   for (unsigned i=0; i < 4; ++i) {
04576     if (Zeroable[i])
04577       continue;
04578     SDValue Elt = Op->getOperand(i);
04579     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
04580         !isa<ConstantSDNode>(Elt.getOperand(1)))
04581       return SDValue();
04582     // Make sure that this node is extracting from a 128-bit vector.
04583     MVT VT = Elt.getOperand(0).getSimpleValueType();
04584     if (!VT.is128BitVector())
04585       return SDValue();
04586     if (!FirstNonZero.getNode()) {
04587       FirstNonZero = Elt;
04588       FirstNonZeroIdx = i;
04589     }
04590   }
04591 
04592   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
04593   SDValue V1 = FirstNonZero.getOperand(0);
04594   MVT VT = V1.getSimpleValueType();
04595 
04596   // See if this build_vector can be lowered as a blend with zero.
04597   SDValue Elt;
04598   unsigned EltMaskIdx, EltIdx;
04599   int Mask[4];
04600   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
04601     if (Zeroable[EltIdx]) {
04602       // The zero vector will be on the right hand side.
04603       Mask[EltIdx] = EltIdx+4;
04604       continue;
04605     }
04606 
04607     Elt = Op->getOperand(EltIdx);
04608     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
04609     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
04610     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
04611       break;
04612     Mask[EltIdx] = EltIdx;
04613   }
04614 
04615   if (EltIdx == 4) {
04616     // Let the shuffle legalizer deal with blend operations.
04617     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
04618     if (V1.getSimpleValueType() != VT)
04619       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
04620     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
04621   }
04622 
04623   // See if we can lower this build_vector to a INSERTPS.
04624   if (!Subtarget->hasSSE41())
04625     return SDValue();
04626 
04627   SDValue V2 = Elt.getOperand(0);
04628   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
04629     V1 = SDValue();
04630 
04631   bool CanFold = true;
04632   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
04633     if (Zeroable[i])
04634       continue;
04635 
04636     SDValue Current = Op->getOperand(i);
04637     SDValue SrcVector = Current->getOperand(0);
04638     if (!V1.getNode())
04639       V1 = SrcVector;
04640     CanFold = SrcVector == V1 &&
04641       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
04642   }
04643 
04644   if (!CanFold)
04645     return SDValue();
04646 
04647   assert(V1.getNode() && "Expected at least two non-zero elements!");
04648   if (V1.getSimpleValueType() != MVT::v4f32)
04649     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
04650   if (V2.getSimpleValueType() != MVT::v4f32)
04651     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
04652 
04653   // Ok, we can emit an INSERTPS instruction.
04654   unsigned ZMask = Zeroable.to_ulong();
04655 
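        // The INSERTPS immediate encodes the source element in bits [7:6], the
        // destination position in bits [5:4], and the zero mask in bits [3:0].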
04656   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
04657   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
04658   SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
04659                                DAG.getIntPtrConstant(InsertPSMask));
04660   return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
04661 }
04662 
04663 /// Return a vector logical shift node.
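      /// The shift is emitted as a whole-register byte shift (VSHLDQ/VSRLDQ on
      /// v2i64), so NumBits is expected to be a multiple of 8.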
04664 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
04665                          unsigned NumBits, SelectionDAG &DAG,
04666                          const TargetLowering &TLI, SDLoc dl) {
04667   assert(VT.is128BitVector() && "Unknown type for VShift");
04668   MVT ShVT = MVT::v2i64;
04669   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
04670   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
04671   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
04672   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
04673   SDValue ShiftVal = DAG.getConstant(NumBits/8, ScalarShiftTy);
04674   return DAG.getNode(ISD::BITCAST, dl, VT,
04675                      DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
04676 }
04677 
04678 static SDValue
04679 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
04680 
04681   // Check if the scalar load can be widened into a vector load. And if
04682   // the address is "base + cst" see if the cst can be "absorbed" into
04683   // the shuffle mask.
04684   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
04685     SDValue Ptr = LD->getBasePtr();
04686     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
04687       return SDValue();
04688     EVT PVT = LD->getValueType(0);
04689     if (PVT != MVT::i32 && PVT != MVT::f32)
04690       return SDValue();
04691 
04692     int FI = -1;
04693     int64_t Offset = 0;
04694     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
04695       FI = FINode->getIndex();
04696       Offset = 0;
04697     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
04698                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
04699       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
04700       Offset = Ptr.getConstantOperandVal(1);
04701       Ptr = Ptr.getOperand(0);
04702     } else {
04703       return SDValue();
04704     }
04705 
04706     // FIXME: 256-bit vector instructions don't require a strict alignment,
04707     // improve this code to support it better.
04708     unsigned RequiredAlign = VT.getSizeInBits()/8;
04709     SDValue Chain = LD->getChain();
04710     // Make sure the stack object alignment is at least 16 or 32.
04711     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
04712     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
04713       if (MFI->isFixedObjectIndex(FI)) {
04714         // Can't change the alignment. FIXME: It's possible to compute
04715         // the exact stack offset and reference FI + adjust offset instead.
04716         // If someone *really* cares about this. That's the way to implement it.
04717         return SDValue();
04718       } else {
04719         MFI->setObjectAlignment(FI, RequiredAlign);
04720       }
04721     }
04722 
04723     // (Offset % 16 or 32) must be a multiple of 4. The address is then
04724     // Ptr + (Offset & ~15).
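          // For illustration: a 4-byte element at Offset 20 in a v4f32
          // (RequiredAlign 16) gives StartOffset 16 and EltNo 1, so the wide
          // load starts at Ptr+16 and element 1 is splatted across the result.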
04725     if (Offset < 0)
04726       return SDValue();
04727     if ((Offset % RequiredAlign) & 3)
04728       return SDValue();
04729     int64_t StartOffset = Offset & ~(RequiredAlign-1);
04730     if (StartOffset)
04731       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
04732                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
04733 
04734     int EltNo = (Offset - StartOffset) >> 2;
04735     unsigned NumElems = VT.getVectorNumElements();
04736 
04737     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
04738     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
04739                              LD->getPointerInfo().getWithOffset(StartOffset),
04740                              false, false, false, 0);
04741 
04742     SmallVector<int, 8> Mask(NumElems, EltNo);
04743 
04744     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
04745   }
04746 
04747   return SDValue();
04748 }
04749 
04750 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
04751 /// elements can be replaced by a single large load which has the same value as
04752 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
04753 ///
04754 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
04755 ///
04756 /// FIXME: we'd also like to handle the case where the last elements are zero
04757 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
04758 /// There's even a handy isZeroNode for that purpose.
04759 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
04760                                         SDLoc &DL, SelectionDAG &DAG,
04761                                         bool isAfterLegalize) {
04762   unsigned NumElems = Elts.size();
04763 
04764   LoadSDNode *LDBase = nullptr;
04765   unsigned LastLoadedElt = -1U;
04766 
04767   // For each element in the initializer, see if we've found a load or an undef.
04768   // If we don't find an initial load element, or later load elements are
04769   // non-consecutive, bail out.
04770   for (unsigned i = 0; i < NumElems; ++i) {
04771     SDValue Elt = Elts[i];
04772     // Look through a bitcast.
04773     if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
04774       Elt = Elt.getOperand(0);
04775     if (!Elt.getNode() ||
04776         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
04777       return SDValue();
04778     if (!LDBase) {
04779       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
04780         return SDValue();
04781       LDBase = cast<LoadSDNode>(Elt.getNode());
04782       LastLoadedElt = i;
04783       continue;
04784     }
04785     if (Elt.getOpcode() == ISD::UNDEF)
04786       continue;
04787 
04788     LoadSDNode *LD = cast<LoadSDNode>(Elt);
04789     EVT LdVT = Elt.getValueType();
04790     // Each loaded element must be the correct fractional portion of the
04791     // requested vector load.
04792     if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
04793       return SDValue();
04794     if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
04795       return SDValue();
04796     LastLoadedElt = i;
04797   }
04798 
04799   // If we have found an entire vector of loads and undefs, then return a large
04800   // load of the entire vector width starting at the base pointer.  If we found
04801   // consecutive loads for the low half, generate a vzext_load node.
04802   if (LastLoadedElt == NumElems - 1) {
04803     assert(LDBase && "Did not find base load for merging consecutive loads");
04804     EVT EltVT = LDBase->getValueType(0);
04805     // Ensure that the input vector size for the merged loads matches the
04806     // cumulative size of the input elements.
04807     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
04808       return SDValue();
04809 
04810     if (isAfterLegalize &&
04811         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
04812       return SDValue();
04813 
04814     SDValue NewLd = SDValue();
04815 
04816     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
04817                         LDBase->getPointerInfo(), LDBase->isVolatile(),
04818                         LDBase->isNonTemporal(), LDBase->isInvariant(),
04819                         LDBase->getAlignment());
04820 
04821     if (LDBase->hasAnyUseOfValue(1)) {
04822       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
04823                                      SDValue(LDBase, 1),
04824                                      SDValue(NewLd.getNode(), 1));
04825       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
04826       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
04827                              SDValue(NewLd.getNode(), 1));
04828     }
04829 
04830     return NewLd;
04831   }
04832 
04833   // TODO: The code below fires only for loading the low v2i32 / v2f32
04834   // of a v4i32 / v4f32. It's probably worth generalizing.
04835   EVT EltVT = VT.getVectorElementType();
04836   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
04837       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
04838     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
04839     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
04840     SDValue ResNode =
04841         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
04842                                 LDBase->getPointerInfo(),
04843                                 LDBase->getAlignment(),
04844                                 false/*isVolatile*/, true/*ReadMem*/,
04845                                 false/*WriteMem*/);
04846 
04847     // Make sure the newly-created LOAD is in the same position as LDBase in
04848     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
04849     // update uses of LDBase's output chain to use the TokenFactor.
04850     if (LDBase->hasAnyUseOfValue(1)) {
04851       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
04852                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
04853       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
04854       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
04855                              SDValue(ResNode.getNode(), 1));
04856     }
04857 
04858     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
04859   }
04860   return SDValue();
04861 }
04862 
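// A standalone sketch of the consecutiveness scan above, using plain
// (base id, byte offset) descriptors in place of LoadSDNodes. It only
// models the full-width case where element 0 is a defined load; the
// low-half vzext_load path of the function is not modelled. The struct
// and function names are hypothetical.
#if 0
#include <cstdint>
#include <vector>

struct EltLoad { bool IsUndef; int BaseId; int64_t ByteOff; };

// Element 0 must be a defined load (the base); every other defined
// element i must load from the same base at offset i * EltBytes.
static bool areConsecutiveLoads(const std::vector<EltLoad> &Elts,
                                unsigned EltBytes) {
  if (Elts.empty() || Elts[0].IsUndef)
    return false;
  for (unsigned i = 1, e = Elts.size(); i != e; ++i) {
    if (Elts[i].IsUndef)
      continue;
    if (Elts[i].BaseId != Elts[0].BaseId ||
        Elts[i].ByteOff != Elts[0].ByteOff + int64_t(i) * EltBytes)
      return false;
  }
  return true;
}
#endif
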
04863 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
04864 /// to generate a splat value for the following cases:
04865 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
04866 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
04867 /// a scalar load, or a constant.
04868 /// The VBROADCAST node is returned when a pattern is found,
04869 /// or SDValue() otherwise.
04870 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
04871                                     SelectionDAG &DAG) {
04872   // VBROADCAST requires AVX.
04873   // TODO: Splats could be generated for non-AVX CPUs using SSE
04874   // instructions, but there's less potential gain for only 128-bit vectors.
04875   if (!Subtarget->hasAVX())
04876     return SDValue();
04877 
04878   MVT VT = Op.getSimpleValueType();
04879   SDLoc dl(Op);
04880 
04881   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
04882          "Unsupported vector type for broadcast.");
04883 
04884   SDValue Ld;
04885   bool ConstSplatVal;
04886 
04887   switch (Op.getOpcode()) {
04888     default:
04889       // Unknown pattern found.
04890       return SDValue();
04891 
04892     case ISD::BUILD_VECTOR: {
04893       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
04894       BitVector UndefElements;
04895       SDValue Splat = BVOp->getSplatValue(&UndefElements);
04896 
04897       // We need a splat of a single value to use broadcast, and it doesn't
04898       // make any sense if the value is only in one element of the vector.
04899       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
04900         return SDValue();
04901 
04902       Ld = Splat;
04903       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
04904                        Ld.getOpcode() == ISD::ConstantFP);
04905 
04906       // Make sure that all of the users of a non-constant load are from the
04907       // BUILD_VECTOR node.
04908       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
04909         return SDValue();
04910       break;
04911     }
04912 
04913     case ISD::VECTOR_SHUFFLE: {
04914       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
04915 
04916       // Shuffles must have a splat mask where the first element is
04917       // broadcasted.
04918       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
04919         return SDValue();
04920 
04921       SDValue Sc = Op.getOperand(0);
04922       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
04923           Sc.getOpcode() != ISD::BUILD_VECTOR) {
04924 
04925         if (!Subtarget->hasInt256())
04926           return SDValue();
04927 
04928         // Use the register form of the broadcast instruction available on AVX2.
04929         if (VT.getSizeInBits() >= 256)
04930           Sc = Extract128BitVector(Sc, 0, DAG, dl);
04931         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
04932       }
04933 
04934       Ld = Sc.getOperand(0);
04935       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
04936                        Ld.getOpcode() == ISD::ConstantFP);
04937 
04938       // The scalar_to_vector node and the suspected
04939       // load node must have exactly one user.
04940       // Constants may have multiple users.
04941 
04942       // AVX-512 has a register version of the broadcast instruction.
04943       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
04944         Ld.getValueType().getSizeInBits() >= 32;
04945       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
04946           !hasRegVer))
04947         return SDValue();
04948       break;
04949     }
04950   }
04951 
04952   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
04953   bool IsGE256 = (VT.getSizeInBits() >= 256);
04954 
04955   // When optimizing for size, generate up to 5 extra bytes for a broadcast
04956   // instruction to save 8 or more bytes of constant pool data.
04957   // TODO: If multiple splats are generated to load the same constant,
04958   // it may be detrimental to overall size. There needs to be a way to detect
04959   // that condition to know if this is truly a size win.
04960   const Function *F = DAG.getMachineFunction().getFunction();
04961   bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
04962 
04963   // Handle broadcasting a single constant scalar from the constant pool
04964   // into a vector.
04965   // On Sandybridge (no AVX2), it is still better to load a constant vector
04966   // from the constant pool and not to broadcast it from a scalar.
04967   // But override that restriction when optimizing for size.
04968   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
04969   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
04970     EVT CVT = Ld.getValueType();
04971     assert(!CVT.isVector() && "Must not broadcast a vector type");
04972 
04973     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
04974     // For size optimization, also splat v2f64 and v2i64, and for size opt
04975     // with AVX2, also splat i8 and i16.
04976     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
04977     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
04978         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
04979       const Constant *C = nullptr;
04980       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
04981         C = CI->getConstantIntValue();
04982       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
04983         C = CF->getConstantFPValue();
04984 
04985       assert(C && "Invalid constant type");
04986 
04987       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
04988       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
04989       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
04990       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
04991                        MachinePointerInfo::getConstantPool(),
04992                        false, false, false, Alignment);
04993 
04994       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
04995     }
04996   }
04997 
04998   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
04999 
05000   // Handle AVX2 in-register broadcasts.
05001   if (!IsLoad && Subtarget->hasInt256() &&
05002       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
05003     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05004 
05005   // The scalar source must be a normal load.
05006   if (!IsLoad)
05007     return SDValue();
05008 
05009   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
05010       (Subtarget->hasVLX() && ScalarSize == 64))
05011     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05012 
05013   // The integer check is needed for the 64-bit scalar into 128-bit vector case,
05014   // so it doesn't match double: there is no vbroadcastsd xmm form.
05015   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
05016     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
05017       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
05018   }
05019 
05020   // Unsupported broadcast.
05021   return SDValue();
05022 }
05023 
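// A standalone sketch of the "which loaded scalar sizes may use
// VBROADCAST" decision made at the end of LowerVectorBroadcast above.
// Subtarget queries are collapsed into booleans, and the constant-pool
// and in-register paths are ignored; names are hypothetical.
#if 0
static bool canBroadcastLoadedScalar(unsigned ScalarSize, bool IsGE256,
                                     bool HasInt256, bool HasVLX,
                                     bool IsInteger) {
  // 32-bit scalars always, 64-bit scalars into 256-bit or wider vectors,
  // and 64-bit scalars with AVX-512VL.
  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
      (HasVLX && ScalarSize == 64))
    return true;
  // AVX2 integer broadcasts: i8/i16, and i64 even into a 128-bit vector
  // (the integer check excludes f64: there is no vbroadcastsd xmm form).
  if (HasInt256 && IsInteger &&
      (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64))
    return true;
  return false;
}
#endif
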
05024 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
05025 /// underlying vector and index.
05026 ///
05027 /// Modifies \p ExtractedFromVec to the real vector and returns the real
05028 /// index.
05029 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
05030                                          SDValue ExtIdx) {
05031   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
05032   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
05033     return Idx;
05034 
05035   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
05036   // lowered this:
05037   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
05038   // to:
05039   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
05040   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
05041   //                           undef)
05042   //                       Constant<0>)
05043   // In this case the vector is the extract_subvector expression and the index
05044   // is 2, as specified by the shuffle.
05045   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
05046   SDValue ShuffleVec = SVOp->getOperand(0);
05047   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
05048   assert(ShuffleVecVT.getVectorElementType() ==
05049          ExtractedFromVec.getSimpleValueType().getVectorElementType());
05050 
05051   int ShuffleIdx = SVOp->getMaskElt(Idx);
05052   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
05053     ExtractedFromVec = ShuffleVec;
05054     return ShuffleIdx;
05055   }
05056   return Idx;
05057 }
05058 
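// A standalone sketch of the index remapping above: if the source of
// the extract is a shuffle whose mask element points into the shuffle's
// first operand (or is undef), the extract can instead read that operand
// directly at the remapped index. Plain ints stand in for DAG nodes;
// names are hypothetical.
#if 0
#include <vector>

static int remapExtractIndex(const std::vector<int> &ShuffleMask, int Idx,
                             bool &UseShuffleOperand) {
  int ShuffleIdx = ShuffleMask[Idx];
  // Undef (-1) or an index into operand 0 lets us look through the
  // shuffle; an index into operand 1 does not.
  UseShuffleOperand = ShuffleIdx < (int)ShuffleMask.size();
  return UseShuffleOperand ? ShuffleIdx : Idx;
}
// Example from the comment above: mask <2,u,u,u>, Idx = 0 -> index 2 of
// the shuffle's first operand (the extract_subvector).
#endif
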
05059 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
05060   MVT VT = Op.getSimpleValueType();
05061 
05062   // Skip if insert_vec_elt is not supported.
05063   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
05064   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
05065     return SDValue();
05066 
05067   SDLoc DL(Op);
05068   unsigned NumElems = Op.getNumOperands();
05069 
05070   SDValue VecIn1;
05071   SDValue VecIn2;
05072   SmallVector<unsigned, 4> InsertIndices;
05073   SmallVector<int, 8> Mask(NumElems, -1);
05074 
05075   for (unsigned i = 0; i != NumElems; ++i) {
05076     unsigned Opc = Op.getOperand(i).getOpcode();
05077 
05078     if (Opc == ISD::UNDEF)
05079       continue;
05080 
05081     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
05082       // Quit if more than 1 element needs inserting.
05083       if (InsertIndices.size() > 1)
05084         return SDValue();
05085 
05086       InsertIndices.push_back(i);
05087       continue;
05088     }
05089 
05090     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
05091     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
05092     // Quit if non-constant index.
05093     if (!isa<ConstantSDNode>(ExtIdx))
05094       return SDValue();
05095     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
05096 
05097     // Quit if extracted from vector of different type.
05098     if (ExtractedFromVec.getValueType() != VT)
05099       return SDValue();
05100 
05101     if (!VecIn1.getNode())
05102       VecIn1 = ExtractedFromVec;
05103     else if (VecIn1 != ExtractedFromVec) {
05104       if (!VecIn2.getNode())
05105         VecIn2 = ExtractedFromVec;
05106       else if (VecIn2 != ExtractedFromVec)
05107         // Quit if there are more than 2 vectors to shuffle.
05108         return SDValue();
05109     }
05110 
05111     if (ExtractedFromVec == VecIn1)
05112       Mask[i] = Idx;
05113     else if (ExtractedFromVec == VecIn2)
05114       Mask[i] = Idx + NumElems;
05115   }
05116 
05117   if (!VecIn1.getNode())
05118     return SDValue();
05119 
05120   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
05121   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
05122   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
05123     unsigned Idx = InsertIndices[i];
05124     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
05125                      DAG.getIntPtrConstant(Idx));
05126   }
05127 
05128   return NV;
05129 }
05130 
05131 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
05132 SDValue
05133 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
05134 
05135   MVT VT = Op.getSimpleValueType();
05136   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
05137          "Unexpected type in LowerBUILD_VECTORvXi1!");
05138 
05139   SDLoc dl(Op);
05140   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
05141     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
05142     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
05143     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
05144   }
05145 
05146   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
05147     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
05148     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
05149     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
05150   }
05151 
05152   bool AllConstants = true;
05153   uint64_t Immediate = 0;
05154   int NonConstIdx = -1;
05155   bool IsSplat = true;
05156   unsigned NumNonConsts = 0;
05157   unsigned NumConsts = 0;
05158   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
05159     SDValue In = Op.getOperand(idx);
05160     if (In.getOpcode() == ISD::UNDEF)
05161       continue;
05162     if (!isa<ConstantSDNode>(In)) {
05163       AllConstants = false;
05164       NonConstIdx = idx;
05165       NumNonConsts++;
05166     } else {
05167       NumConsts++;
05168       if (cast<ConstantSDNode>(In)->getZExtValue())
05169         Immediate |= (1ULL << idx);
05170     }
05171     if (In != Op.getOperand(0))
05172       IsSplat = false;
05173   }
05174 
05175   if (AllConstants) {
05176     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
05177       DAG.getConstant(Immediate, MVT::i16));
05178     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
05179                        DAG.getIntPtrConstant(0));
05180   }
05181 
05182   if (NumNonConsts == 1 && NonConstIdx != 0) {
05183     SDValue DstVec;
05184     if (NumConsts) {
05185       SDValue VecAsImm = DAG.getConstant(Immediate,
05186                                          MVT::getIntegerVT(VT.getSizeInBits()));
05187       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
05188     }
05189     else
05190       DstVec = DAG.getUNDEF(VT);
05191     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
05192                        Op.getOperand(NonConstIdx),
05193                        DAG.getIntPtrConstant(NonConstIdx));
05194   }
05195   if (!IsSplat && (NonConstIdx != 0))
05196     llvm_unreachable("Unsupported BUILD_VECTOR operation");
05197   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
05198   SDValue Select;
05199   if (IsSplat)
05200     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
05201                           DAG.getConstant(-1, SelectVT),
05202                           DAG.getConstant(0, SelectVT));
05203   else
05204     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
05205                          DAG.getConstant((Immediate | 1), SelectVT),
05206                          DAG.getConstant(Immediate, SelectVT));
05207   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
05208 }
05209 
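// A standalone sketch of how the all-constant case above folds the i1
// operands of the build_vector into a single immediate bitmask. Here -1
// stands in for an undef lane; names are hypothetical.
#if 0
#include <cstdint>
#include <vector>

static uint64_t foldMaskImmediate(const std::vector<int> &Bits) {
  uint64_t Immediate = 0;
  for (unsigned idx = 0, e = Bits.size(); idx != e; ++idx)
    if (Bits[idx] == 1)
      Immediate |= (1ULL << idx);
  return Immediate;
}
// Example: {1, 0, -1, 1} -> 0b1001, which is then bitcast to a v16i1
// mask and the requested subvector extracted from it.
#endif
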
05210 /// \brief Return true if \p N implements a horizontal binop and return the
05211 /// operands for the horizontal binop into V0 and V1.
05212 ///
05213 /// This is a helper function of LowerToHorizontalOp().
05214 /// This function checks whether the input build_vector \p N implements a
05215 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
05216 /// operation to match.
05217 /// For example, if \p Opcode is equal to ISD::ADD, then this function
05218 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
05219 /// is equal to ISD::SUB, then this function checks if this is a horizontal
05220 /// arithmetic sub.
05221 ///
05222 /// This function only analyzes elements of \p N whose indices are
05223 /// in range [BaseIdx, LastIdx).
05224 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
05225                               SelectionDAG &DAG,
05226                               unsigned BaseIdx, unsigned LastIdx,
05227                               SDValue &V0, SDValue &V1) {
05228   EVT VT = N->getValueType(0);
05229 
05230   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
05231   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
05232          "Invalid Vector in input!");
05233 
05234   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
05235   bool CanFold = true;
05236   unsigned ExpectedVExtractIdx = BaseIdx;
05237   unsigned NumElts = LastIdx - BaseIdx;
05238   V0 = DAG.getUNDEF(VT);
05239   V1 = DAG.getUNDEF(VT);
05240 
05241   // Check if N implements a horizontal binop.
05242   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
05243     SDValue Op = N->getOperand(i + BaseIdx);
05244 
05245     // Skip UNDEFs.
05246     if (Op->getOpcode() == ISD::UNDEF) {
05247       // Update the expected vector extract index.
05248       if (i * 2 == NumElts)
05249         ExpectedVExtractIdx = BaseIdx;
05250       ExpectedVExtractIdx += 2;
05251       continue;
05252     }
05253 
05254     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
05255 
05256     if (!CanFold)
05257       break;
05258 
05259     SDValue Op0 = Op.getOperand(0);
05260     SDValue Op1 = Op.getOperand(1);
05261 
05262     // Try to match the following pattern:
05263     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
05264     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
05265         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
05266         Op0.getOperand(0) == Op1.getOperand(0) &&
05267         isa<ConstantSDNode>(Op0.getOperand(1)) &&
05268         isa<ConstantSDNode>(Op1.getOperand(1)));
05269     if (!CanFold)
05270       break;
05271 
05272     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
05273     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
05274 
05275     if (i * 2 < NumElts) {
05276       if (V0.getOpcode() == ISD::UNDEF) {
05277         V0 = Op0.getOperand(0);
05278         if (V0.getValueType() != VT)
05279           return false;
05280       }
05281     } else {
05282       if (V1.getOpcode() == ISD::UNDEF) {
05283         V1 = Op0.getOperand(0);
05284         if (V1.getValueType() != VT)
05285           return false;
05286       }
05287       if (i * 2 == NumElts)
05288         ExpectedVExtractIdx = BaseIdx;
05289     }
05290 
05291     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
05292     if (I0 == ExpectedVExtractIdx)
05293       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
05294     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
05295       // Try to match the following dag sequence:
05296       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
05297       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
05298     } else
05299       CanFold = false;
05300 
05301     ExpectedVExtractIdx += 2;
05302   }
05303 
05304   return CanFold;
05305 }
05306 
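// A standalone sketch of the extract-index pattern matched above:
// result element i must combine lanes 2*i and 2*i+1 of one source
// vector, with the first half of the results drawn from V0 and the
// second half from V1. The commutable swapped-operand case is not
// modelled; names are hypothetical.
#if 0
#include <utility>
#include <vector>

static bool isHorizontalIndexPattern(
    const std::vector<std::pair<int, int>> &ExtractIdx) {
  unsigned NumElts = ExtractIdx.size();
  unsigned Expected = 0;
  for (unsigned i = 0; i != NumElts; ++i) {
    if (i * 2 == NumElts)
      Expected = 0; // switch from V0's lanes to V1's lanes
    if (ExtractIdx[i].first != (int)Expected ||
        ExtractIdx[i].second != (int)Expected + 1)
      return false;
    Expected += 2;
  }
  return true;
}
// Example for a v4f32 HADD: {{0,1}, {2,3}, {0,1}, {2,3}} matches.
#endif
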
05307 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
05308 /// a concat_vector.
05309 ///
05310 /// This is a helper function of LowerToHorizontalOp().
05311 /// This function expects two 256-bit vectors called V0 and V1.
05312 /// At first, each vector is split into two separate 128-bit vectors.
05313 /// Then, the resulting 128-bit vectors are used to implement two
05314 /// horizontal binary operations.
05315 ///
05316 /// The kind of horizontal binary operation is defined by \p X86Opcode.
05317 ///
05318 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
05319 /// the two new horizontal binops.
05320 /// When Mode is set, the first horizontal binop dag node would take as input
05321 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
05322 /// horizontal binop dag node would take as input the lower 128-bit of V1
05323 /// and the upper 128-bit of V1.
05324 ///   Example:
05325 ///     HADD V0_LO, V0_HI
05326 ///     HADD V1_LO, V1_HI
05327 ///
05328 /// Otherwise, the first horizontal binop dag node takes as input the lower
05329 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
05330 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
05331 ///   Example:
05332 ///     HADD V0_LO, V1_LO
05333 ///     HADD V0_HI, V1_HI
05334 ///
05335 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
05336 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
05337 /// the upper 128-bits of the result.
05338 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
05339                                      SDLoc DL, SelectionDAG &DAG,
05340                                      unsigned X86Opcode, bool Mode,
05341                                      bool isUndefLO, bool isUndefHI) {
05342   EVT VT = V0.getValueType();
05343   assert(VT.is256BitVector() && VT == V1.getValueType() &&
05344          "Invalid nodes in input!");
05345 
05346   unsigned NumElts = VT.getVectorNumElements();
05347   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
05348   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
05349   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
05350   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
05351   EVT NewVT = V0_LO.getValueType();
05352 
05353   SDValue LO = DAG.getUNDEF(NewVT);
05354   SDValue HI = DAG.getUNDEF(NewVT);
05355 
05356   if (Mode) {
05357     // Don't emit a horizontal binop if the result is expected to be UNDEF.
05358     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
05359       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
05360     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
05361       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
05362   } else {
05363     // Don't emit a horizontal binop if the result is expected to be UNDEF.
05364     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
05365                        V1_LO->getOpcode() != ISD::UNDEF))
05366       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
05367 
05368     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
05369                        V1_HI->getOpcode() != ISD::UNDEF))
05370       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
05371   }
05372 
05373   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
05374 }
05375 
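// For reference, the scalar semantics of one 128-bit HADDPS as used by
// the expansion above: each new horizontal node computes this on its two
// 128-bit inputs, and the two results are concatenated. The helper is
// illustrative only.
#if 0
static void haddps128(const float *X, const float *Y, float *R) {
  R[0] = X[0] + X[1]; // low half comes from the first operand
  R[1] = X[2] + X[3];
  R[2] = Y[0] + Y[1]; // high half comes from the second operand
  R[3] = Y[2] + Y[3];
}
#endif
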
05376 /// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
05377 /// node.
05378 static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
05379                              const X86Subtarget *Subtarget, SelectionDAG &DAG) {
05380   EVT VT = BV->getValueType(0);
05381   if ((!Subtarget->hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
05382       (!Subtarget->hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
05383     return SDValue();
05384 
05385   SDLoc DL(BV);
05386   unsigned NumElts = VT.getVectorNumElements();
05387   SDValue InVec0 = DAG.getUNDEF(VT);
05388   SDValue InVec1 = DAG.getUNDEF(VT);
05389 
05390   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
05391           VT == MVT::v2f64) && "build_vector with an invalid type found!");
05392 
05393   // Odd-numbered elements in the input build vector are obtained from
05394   // adding two floating-point elements.
05395   // Even-numbered elements in the input build vector are obtained from
05396   // subtracting two floating-point elements.
05397   unsigned ExpectedOpcode = ISD::FSUB;
05398   unsigned NextExpectedOpcode = ISD::FADD;
05399   bool AddFound = false;
05400   bool SubFound = false;
05401 
05402   for (unsigned i = 0, e = NumElts; i != e; ++i) {
05403     SDValue Op = BV->getOperand(i);
05404 
05405     // Skip 'undef' values.
05406     unsigned Opcode = Op.getOpcode();
05407     if (Opcode == ISD::UNDEF) {
05408       std::swap(ExpectedOpcode, NextExpectedOpcode);
05409       continue;
05410     }
05411 
05412     // Early exit if we found an unexpected opcode.
05413     if (Opcode != ExpectedOpcode)
05414       return SDValue();
05415 
05416     SDValue Op0 = Op.getOperand(0);
05417     SDValue Op1 = Op.getOperand(1);
05418 
05419     // Try to match the following pattern:
05420     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
05421     // Early exit if we cannot match that sequence.
05422     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
05423         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
05424         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
05425         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
05426         Op0.getOperand(1) != Op1.getOperand(1))
05427       return SDValue();
05428 
05429     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
05430     if (I0 != i)
05431       return SDValue();
05432 
05433     // We found a valid add/sub node. Update the information accordingly.
05434     if (i & 1)
05435       AddFound = true;
05436     else
05437       SubFound = true;
05438 
05439     // Update InVec0 and InVec1.
05440     if (InVec0.getOpcode() == ISD::UNDEF) {
05441       InVec0 = Op0.getOperand(0);
05442       if (InVec0.getValueType() != VT)
05443         return SDValue();
05444     }
05445     if (InVec1.getOpcode() == ISD::UNDEF) {
05446       InVec1 = Op1.getOperand(0);
05447       if (InVec1.getValueType() != VT)
05448         return SDValue();
05449     }
05450 
05451     // Make sure that the operands of each add/sub node always
05452     // come from the same pair of vectors.
05453     if (InVec0 != Op0.getOperand(0)) {
05454       if (ExpectedOpcode == ISD::FSUB)
05455         return SDValue();
05456 
05457       // FADD is commutable. Try to commute the operands
05458       // and then test again.
05459       std::swap(Op0, Op1);
05460       if (InVec0 != Op0.getOperand(0))
05461         return SDValue();
05462     }
05463 
05464     if (InVec1 != Op1.getOperand(0))
05465       return SDValue();
05466 
05467     // Update the pair of expected opcodes.
05468     std::swap(ExpectedOpcode, NextExpectedOpcode);
05469   }
05470 
05471   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
05472   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
05473       InVec1.getOpcode() != ISD::UNDEF)
05474     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
05475 
05476   return SDValue();
05477 }
05478 
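// A standalone reference for the per-lane pattern matched above: even
// result lanes subtract, odd result lanes add, with both operands taken
// from lane i of the same two source vectors. This is what a single
// ADDSUBPS/ADDSUBPD computes; the helper is illustrative only.
#if 0
static void addsubReference(const float *A, const float *B, float *R,
                            unsigned NumElts) {
  for (unsigned i = 0; i != NumElts; ++i)
    R[i] = (i & 1) ? A[i] + B[i] : A[i] - B[i];
}
// Example: A = {1,2,3,4}, B = {4,3,2,1} -> R = {-3,5,1,5}.
#endif
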
05479 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
05480 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
05481                                    const X86Subtarget *Subtarget,
05482                                    SelectionDAG &DAG) {
05483   EVT VT = BV->getValueType(0);
05484   unsigned NumElts = VT.getVectorNumElements();
05485   unsigned NumUndefsLO = 0;
05486   unsigned NumUndefsHI = 0;
05487   unsigned Half = NumElts/2;
05488 
05489   // Count the number of UNDEF operands in the input build_vector.
05490   for (unsigned i = 0, e = Half; i != e; ++i)
05491     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
05492       NumUndefsLO++;
05493 
05494   for (unsigned i = Half, e = NumElts; i != e; ++i)
05495     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
05496       NumUndefsHI++;
05497 
05498   // Early exit if this is either a build_vector of all UNDEFs, or if all
05499   // operands but one are UNDEF.
05500   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
05501     return SDValue();
05502 
05503   SDLoc DL(BV);
05504   SDValue InVec0, InVec1;
05505   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
05506     // Try to match an SSE3 float HADD/HSUB.
05507     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
05508       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
05509 
05510     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
05511       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
05512   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
05513     // Try to match an SSSE3 integer HADD/HSUB.
05514     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
05515       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
05516 
05517     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
05518       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
05519   }
05520 
05521   if (!Subtarget->hasAVX())
05522     return SDValue();
05523 
05524   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
05525     // Try to match an AVX horizontal add/sub of packed single/double
05526     // precision floating point values from 256-bit vectors.
05527     SDValue InVec2, InVec3;
05528     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
05529         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
05530         ((InVec0.getOpcode() == ISD::UNDEF ||
05531           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
05532         ((InVec1.getOpcode() == ISD::UNDEF ||
05533           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
05534       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
05535 
05536     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
05537         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
05538         ((InVec0.getOpcode() == ISD::UNDEF ||
05539           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
05540         ((InVec1.getOpcode() == ISD::UNDEF ||
05541           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
05542       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
05543   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
05544     // Try to match an AVX2 horizontal add/sub of signed integers.
05545     SDValue InVec2, InVec3;
05546     unsigned X86Opcode;
05547     bool CanFold = true;
05548 
05549     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
05550         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
05551         ((InVec0.getOpcode() == ISD::UNDEF ||
05552           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
05553         ((InVec1.getOpcode() == ISD::UNDEF ||
05554           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
05555       X86Opcode = X86ISD::HADD;
05556     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
05557         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
05558         ((InVec0.getOpcode() == ISD::UNDEF ||
05559           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
05560         ((InVec1.getOpcode() == ISD::UNDEF ||
05561           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
05562       X86Opcode = X86ISD::HSUB;
05563     else
05564       CanFold = false;
05565 
05566     if (CanFold) {
05567       // Fold this build_vector into a single horizontal add/sub.
05568       // Do this only if the target has AVX2.
05569       if (Subtarget->hasAVX2())
05570         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
05571 
05572       // Do not try to expand this build_vector into a pair of horizontal
05573       // add/sub if we can emit a pair of scalar add/sub.
05574       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
05575         return SDValue();
05576 
05577       // Convert this build_vector into a pair of horizontal binops followed by
05578       // a concat vector.
05579       bool isUndefLO = NumUndefsLO == Half;
05580       bool isUndefHI = NumUndefsHI == Half;
05581       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
05582                                    isUndefLO, isUndefHI);
05583     }
05584   }
05585 
05586   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
05587        VT == MVT::v16i16) && Subtarget->hasAVX()) {
05588     unsigned X86Opcode;
05589     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
05590       X86Opcode = X86ISD::HADD;
05591     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
05592       X86Opcode = X86ISD::HSUB;
05593     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
05594       X86Opcode = X86ISD::FHADD;
05595     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
05596       X86Opcode = X86ISD::FHSUB;
05597     else
05598       return SDValue();
05599 
05600     // Don't try to expand this build_vector into a pair of horizontal add/sub
05601     // if we can simply emit a pair of scalar add/sub.
05602     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
05603       return SDValue();
05604 
05605     // Convert this build_vector into two horizontal add/sub followed by
05606     // a concat vector.
05607     bool isUndefLO = NumUndefsLO == Half;
05608     bool isUndefHI = NumUndefsHI == Half;
05609     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
05610                                  isUndefLO, isUndefHI);
05611   }
05612 
05613   return SDValue();
05614 }
05615 
05616 SDValue
05617 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
05618   SDLoc dl(Op);
05619 
05620   MVT VT = Op.getSimpleValueType();
05621   MVT ExtVT = VT.getVectorElementType();
05622   unsigned NumElems = Op.getNumOperands();
05623 
05624   // Predicate (vXi1) vectors are lowered by dedicated code.
05625   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
05626     return LowerBUILD_VECTORvXi1(Op, DAG);
05627 
05628   // Vectors containing all zeros can be matched by pxor and xorps later
05629   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
05630     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
05631     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
05632     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
05633       return Op;
05634 
05635     return getZeroVector(VT, Subtarget, DAG, dl);
05636   }
05637 
05638   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
05639   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
05640   // vpcmpeqd on 256-bit vectors.
05641   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
05642     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
05643       return Op;
05644 
05645     if (!VT.is512BitVector())
05646       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
05647   }
05648 
05649   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
05650   if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
05651     return AddSub;
05652   if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
05653     return HorizontalOp;
05654   if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
05655     return Broadcast;
05656 
05657   unsigned EVTBits = ExtVT.getSizeInBits();
05658 
05659   unsigned NumZero  = 0;
05660   unsigned NumNonZero = 0;
05661   unsigned NonZeros = 0;
05662   bool IsAllConstants = true;
05663   SmallSet<SDValue, 8> Values;
05664   for (unsigned i = 0; i < NumElems; ++i) {
05665     SDValue Elt = Op.getOperand(i);
05666     if (Elt.getOpcode() == ISD::UNDEF)
05667       continue;
05668     Values.insert(Elt);
05669     if (Elt.getOpcode() != ISD::Constant &&
05670         Elt.getOpcode() != ISD::ConstantFP)
05671       IsAllConstants = false;
05672     if (X86::isZeroNode(Elt))
05673       NumZero++;
05674     else {
05675       NonZeros |= (1 << i);
05676       NumNonZero++;
05677     }
05678   }
05679 
05680   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
05681   if (NumNonZero == 0)
05682     return DAG.getUNDEF(VT);
05683 
05684   // Special case for single non-zero, non-undef, element.
05685   if (NumNonZero == 1) {
05686     unsigned Idx = countTrailingZeros(NonZeros);
05687     SDValue Item = Op.getOperand(Idx);
05688 
05689     // If this is an insertion of an i64 value on x86-32, and if the top bits of
05690     // the value are obviously zero, truncate the value to i32 and do the
05691     // insertion that way.  Only do this if the value is non-constant or if the
05692     // value is a constant being inserted into element 0.  It is cheaper to do
05693     // a constant pool load than it is to do a movd + shuffle.
05694     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
05695         (!IsAllConstants || Idx == 0)) {
05696       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
05697         // Handle SSE only.
05698         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
05699         EVT VecVT = MVT::v4i32;
05700 
05701         // Truncate the value (which may itself be a constant) to i32, and
05702         // convert it to a vector with movd (S2V+shuffle to zero extend).
05703         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
05704         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
05705         return DAG.getNode(
05706             ISD::BITCAST, dl, VT,
05707             getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
05708       }
05709     }
05710 
05711     // If we have a constant or non-constant insertion into the low element of
05712     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
05713     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
05714     // depending on what the source datatype is.
05715     if (Idx == 0) {
05716       if (NumZero == 0)
05717         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
05718 
05719       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
05720           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
05721         if (VT.is512BitVector()) {
05722           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
05723           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
05724                              Item, DAG.getIntPtrConstant(0));
05725         }
05726         assert((VT.is128BitVector() || VT.is256BitVector()) &&
05727                "Expected an SSE value type!");
05728         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
05729         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
05730         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
05731       }
05732 
05733       // We can't directly insert an i8 or i16 into a vector, so zero extend
05734       // it to i32 first.
05735       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
05736         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
05737         if (VT.is256BitVector()) {
05738           if (Subtarget->hasAVX()) {
05739             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v8i32, Item);
05740             Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
05741           } else {
05742             // Without AVX, we need to extend to a 128-bit vector and then
05743             // insert into the 256-bit vector.
05744             Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
05745             SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
05746             Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
05747           }
05748         } else {
05749           assert(VT.is128BitVector() && "Expected an SSE value type!");
05750           Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
05751           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
05752         }
05753         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
05754       }
05755     }
05756 
05757     // Is it a vector logical left shift?
05758     if (NumElems == 2 && Idx == 1 &&
05759         X86::isZeroNode(Op.getOperand(0)) &&
05760         !X86::isZeroNode(Op.getOperand(1))) {
05761       unsigned NumBits = VT.getSizeInBits();
05762       return getVShift(true, VT,
05763                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
05764                                    VT, Op.getOperand(1)),
05765                        NumBits/2, DAG, *this, dl);
05766     }
05767 
05768     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
05769       return SDValue();
05770 
05771     // Otherwise, if this is a vector with i32 or f32 elements, and the element
05772     // is a non-constant being inserted into an element other than the low one,
05773     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
05774     // movd/movss) to move this into the low element, then shuffle it into
05775     // place.
05776     if (EVTBits == 32) {
05777       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
05778       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
05779     }
05780   }
05781 
05782   // Splat is obviously ok. Let legalizer expand it to a shuffle.
05783   if (Values.size() == 1) {
05784     if (EVTBits == 32) {
05785       // Instead of a shuffle like this:
05786       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
05787       // Check if it's possible to issue this instead.
05788       // shuffle (vload ptr), undef, <1, 1, 1, 1>
05789       unsigned Idx = countTrailingZeros(NonZeros);
05790       SDValue Item = Op.getOperand(Idx);
05791       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
05792         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
05793     }
05794     return SDValue();
05795   }
05796 
05797   // A vector full of immediates; various special cases are already
05798   // handled, so this is best done with a single constant-pool load.
05799   if (IsAllConstants)
05800     return SDValue();
05801 
05802   // For AVX-length vectors, see if we can use a vector load to get all of the
05803   // elements, otherwise build the individual 128-bit pieces and use
05804   // shuffles to put them in place.
05805   if (VT.is256BitVector() || VT.is512BitVector()) {
05806     SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
05807 
05808     // Check for a build vector of consecutive loads.
05809     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
05810       return LD;
05811 
05812     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
05813 
05814     // Build both the lower and upper subvector.
05815     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
05816                                 makeArrayRef(&V[0], NumElems/2));
05817     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
05818                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
05819 
05820     // Recreate the wider vector with the lower and upper part.
05821     if (VT.is256BitVector())
05822       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
05823     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
05824   }
05825 
05826   // Let legalizer expand 2-wide build_vectors.
05827   if (EVTBits == 64) {
05828     if (NumNonZero == 1) {
05829       // One half is zero or undef.
05830       unsigned Idx = countTrailingZeros(NonZeros);
05831       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
05832                                  Op.getOperand(Idx));
05833       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
05834     }
05835     return SDValue();
05836   }
05837 
05838   // If element VT is < 32 bits, convert it to inserts into a zero vector.
05839   if (EVTBits == 8 && NumElems == 16)
05840     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
05841                                         Subtarget, *this))
05842       return V;
05843 
05844   if (EVTBits == 16 && NumElems == 8)
05845     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
05846                                       Subtarget, *this))
05847       return V;
05848 
05849   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
05850   if (EVTBits == 32 && NumElems == 4)
05851     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
05852       return V;
05853 
05854   // If element VT is == 32 bits, turn it into a number of shuffles.
05855   SmallVector<SDValue, 8> V(NumElems);
05856   if (NumElems == 4 && NumZero > 0) {
05857     for (unsigned i = 0; i < 4; ++i) {
05858       bool isZero = !(NonZeros & (1 << i));
05859       if (isZero)
05860         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
05861       else
05862         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
05863     }
05864 
05865     for (unsigned i = 0; i < 2; ++i) {
05866       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
05867         default: break;
05868         case 0:
05869           V[i] = V[i*2];  // Must be a zero vector.
05870           break;
05871         case 1:
05872           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
05873           break;
05874         case 2:
05875           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
05876           break;
05877         case 3:
05878           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
05879           break;
05880       }
05881     }
05882 
05883     bool Reverse1 = (NonZeros & 0x3) == 2;
05884     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
05885     int MaskVec[] = {
05886       Reverse1 ? 1 : 0,
05887       Reverse1 ? 0 : 1,
05888       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
05889       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
05890     };
05891     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
05892   }
05893 
05894   if (Values.size() > 1 && VT.is128BitVector()) {
05895     // Check for a build vector of consecutive loads.
05896     for (unsigned i = 0; i < NumElems; ++i)
05897       V[i] = Op.getOperand(i);
05898 
05899     // Check for elements which are consecutive loads.
05900     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
05901       return LD;
05902 
05903     // Check for a build vector from mostly shuffle plus few inserting.
05904     if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
05905       return Sh;
05906 
05907     // For SSE 4.1, use insertps to insert each of the remaining elements.
05908     if (Subtarget->hasSSE41()) {
05909       SDValue Result;
05910       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
05911         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
05912       else
05913         Result = DAG.getUNDEF(VT);
05914 
05915       for (unsigned i = 1; i < NumElems; ++i) {
05916         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
05917         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
05918                              Op.getOperand(i), DAG.getIntPtrConstant(i));
05919       }
05920       return Result;
05921     }
05922 
05923     // Otherwise, expand into a number of unpckl*; start by extending each of
05924     // our (non-undef) elements to the full vector width with the element in the
05925     // bottom slot of the vector (which generates no code for SSE).
05926     for (unsigned i = 0; i < NumElems; ++i) {
05927       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
05928         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
05929       else
05930         V[i] = DAG.getUNDEF(VT);
05931     }
05932 
05933     // Next, we iteratively mix elements, e.g. for v4f32:
05934     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
05935     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
05936     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
05937     unsigned EltStride = NumElems >> 1;
05938     while (EltStride != 0) {
05939       for (unsigned i = 0; i < EltStride; ++i) {
05940         // If V[i+EltStride] is undef and this is the first round of mixing,
05941         // then it is safe to just drop this shuffle: V[i] is already in the
05942         // right place, the one element (since it's the first round) being
05943         // inserted as undef can be dropped.  This isn't safe for successive
05944         // rounds because they will permute elements within both vectors.
05945         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
05946             EltStride == NumElems/2)
05947           continue;
05948 
05949         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
05950       }
05951       EltStride >>= 1;
05952     }
05953     return V[0];
05954   }
05955   return SDValue();
05956 }
05957 
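// A standalone sketch of the unpckl merge cascade used at the end of
// LowerBUILD_VECTOR above, shown for v4f32 with V[i] holding element i
// in its low lane. unpcklps interleaves the low halves of its operands;
// the helper mirrors that and is illustrative only.
#if 0
#include <array>

static std::array<float, 4> unpcklps(const std::array<float, 4> &A,
                                     const std::array<float, 4> &B) {
  return {A[0], B[0], A[1], B[1]};
}
// With V[i] = {e_i, ?, ?, ?}:
//   X = unpcklps(V[0], V[2]) = {e0, e2, ?, ?}
//   Y = unpcklps(V[1], V[3]) = {e1, e3, ?, ?}
//   unpcklps(X, Y)           = {e0, e1, e2, e3}
#endif
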
05958 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
05959 // to create 256-bit vectors from two other 128-bit ones.
05960 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
05961   SDLoc dl(Op);
05962   MVT ResVT = Op.getSimpleValueType();
05963 
05964   assert((ResVT.is256BitVector() ||
05965           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
05966 
05967   SDValue V1 = Op.getOperand(0);
05968   SDValue V2 = Op.getOperand(1);
05969   unsigned NumElems = ResVT.getVectorNumElements();
05970   if (ResVT.is256BitVector())
05971     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
05972 
05973   if (Op.getNumOperands() == 4) {
05974     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
05975                                 ResVT.getVectorNumElements()/2);
05976     SDValue V3 = Op.getOperand(2);
05977     SDValue V4 = Op.getOperand(3);
05978     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
05979       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
05980   }
05981   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
05982 }
05983 
05984 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
05985                                        const X86Subtarget *Subtarget,
05986                                        SelectionDAG & DAG) {
05987   SDLoc dl(Op);
05988   MVT ResVT = Op.getSimpleValueType();
05989   unsigned NumOfOperands = Op.getNumOperands();
05990 
05991   assert(isPowerOf2_32(NumOfOperands) &&
05992          "Unexpected number of operands in CONCAT_VECTORS");
05993 
05994   if (NumOfOperands > 2) {
05995     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
05996                                   ResVT.getVectorNumElements()/2);
05997     SmallVector<SDValue, 2> Ops;
05998     for (unsigned i = 0; i < NumOfOperands/2; i++)
05999       Ops.push_back(Op.getOperand(i));
06000     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
06001     Ops.clear();
06002     for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
06003       Ops.push_back(Op.getOperand(i));
06004     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
06005     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
06006   }
06007 
06008   SDValue V1 = Op.getOperand(0);
06009   SDValue V2 = Op.getOperand(1);
06010   bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
06011   bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
06012 
06013   if (IsZeroV1 && IsZeroV2)
06014     return getZeroVector(ResVT, Subtarget, DAG, dl);
06015 
06016   SDValue ZeroIdx = DAG.getIntPtrConstant(0);
06017   SDValue Undef = DAG.getUNDEF(ResVT);
06018   unsigned NumElems = ResVT.getVectorNumElements();
06019   SDValue ShiftBits = DAG.getConstant(NumElems/2, MVT::i8);
06020 
06021   V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx);
06022   V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits);
06023   if (IsZeroV1)
06024     return V2;
06025 
06026   V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
06027   // Zero the upper bits of V1
06028   V1 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits);
06029   V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits);
06030   if (IsZeroV2)
06031     return V1;
06032   return DAG.getNode(ISD::OR, dl, ResVT, V1, V2);
06033 }
06034 
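// A standalone sketch, on plain bitmasks, of the combination used above
// to concatenate two vXi1 mask vectors: V1's bits are confined to the
// low half (the VSHLI/VSRLI pair, i.e. masking) and V2's bits are shifted
// into the upper half before the OR. The 32-bit container and helper name
// are illustrative assumptions.
#if 0
#include <cstdint>

static uint32_t concatMaskHalves(uint32_t LoMask, uint32_t HiMask,
                                 unsigned NumElems) {
  unsigned Half = NumElems / 2;
  // Zero the upper bits of the low-half mask.
  uint32_t Lo = LoMask & ((1u << Half) - 1);
  // Shift the other mask into the upper half and OR the two together.
  return (HiMask << Half) | Lo;
}
// Example: LoMask = 0b1010, HiMask = 0b0110, NumElems = 8 -> 0b01101010.
#endif
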
06035 static SDValue LowerCONCAT_VECTORS(SDValue Op,
06036                                    const X86Subtarget *Subtarget,
06037                                    SelectionDAG &DAG) {
06038   MVT VT = Op.getSimpleValueType();
06039   if (VT.getVectorElementType() == MVT::i1)
06040     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
06041 
06042   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
06043          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
06044           Op.getNumOperands() == 4)));
06045 
06046   // AVX can use the vinsertf128 instruction to create 256-bit vectors
06047   // from two other 128-bit ones.
06048 
06049   // A 512-bit vector may be built from two 256-bit vectors or four 128-bit ones.
06050   return LowerAVXCONCAT_VECTORS(Op, DAG);
06051 }
06052 
06053 
06054 //===----------------------------------------------------------------------===//
06055 // Vector shuffle lowering
06056 //
06057 // This is an experimental code path for lowering vector shuffles on x86. It is
06058 // designed to handle arbitrary vector shuffles and blends, gracefully
06059 // degrading performance as necessary. It works hard to recognize idiomatic
06060 // shuffles and lower them to optimal instruction patterns without leaving
06061 // a framework that allows reasonably efficient handling of all vector shuffle
06062 // patterns.
06063 //===----------------------------------------------------------------------===//
06064 
06065 /// \brief Tiny helper function to identify a no-op mask.
06066 ///
06067 /// This is a somewhat boring predicate function. It checks whether the mask
06068 /// array input, which is assumed to be a single-input shuffle mask of the kind
06069 /// used by the X86 shuffle instructions (not a fully general
06070 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
06071 /// in-place shuffle are 'no-op's.
06072 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
06073   for (int i = 0, Size = Mask.size(); i < Size; ++i)
06074     if (Mask[i] != -1 && Mask[i] != i)
06075       return false;
06076   return true;
06077 }
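
// Illustrative example (hypothetical masks): {0, -1, 2, 3} is a no-op because
// every defined element stays in place and undef entries are ignored, while
// {1, 0, 2, 3} is not a no-op since element 0 is sourced from index 1.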
06078 
06079 /// \brief Helper function to classify a mask as a single-input mask.
06080 ///
06081 /// This isn't a generic single-input test because in the vector shuffle
06082 /// lowering we canonicalize single inputs to be the first input operand. This
06083 /// means we can more quickly test for a single input by only checking whether
06084 /// an input from the second operand exists. We also assume that the size of
06085 /// the mask corresponds to the size of the input vectors, which isn't true in
06086 /// the fully general case.
06087 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
06088   for (int M : Mask)
06089     if (M >= (int)Mask.size())
06090       return false;
06091   return true;
06092 }
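
// Illustrative example (hypothetical masks): for a 4-element mask,
// {2, -1, 0, 1} only references indices below Mask.size() and is treated as
// single-input, while {2, 5, 0, 1} references index 5 (an element of the
// second operand) and is not.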
06093 
06094 /// \brief Test whether there are elements crossing 128-bit lanes in this
06095 /// shuffle mask.
06096 ///
06097 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
06098 /// and we routinely test for these.
06099 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
06100   int LaneSize = 128 / VT.getScalarSizeInBits();
06101   int Size = Mask.size();
06102   for (int i = 0; i < Size; ++i)
06103     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
06104       return true;
06105   return false;
06106 }
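
// Illustrative example (hypothetical masks): for v8f32 the lane size is 4, so
// {1, 0, 3, 2, 5, 4, 7, 6} stays within each 128-bit lane, while
// {4, 1, 2, 3, 0, 5, 6, 7} crosses lanes because element 0 is sourced from
// index 4 in the upper lane.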
06107 
06108 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
06109 ///
06110 /// This checks a shuffle mask to see if it is performing the same
06111 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
06112 /// that it is also not lane-crossing. It may however involve a blend from the
06113 /// same lane of a second vector.
06114 ///
06115 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
06116 /// non-trivial to compute in the face of undef lanes. The representation is
06117 /// *not* suitable for use with existing 128-bit shuffles as it will contain
06118 /// entries from both V1 and V2 inputs to the wider mask.
06119 static bool
06120 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
06121                                 SmallVectorImpl<int> &RepeatedMask) {
06122   int LaneSize = 128 / VT.getScalarSizeInBits();
06123   RepeatedMask.resize(LaneSize, -1);
06124   int Size = Mask.size();
06125   for (int i = 0; i < Size; ++i) {
06126     if (Mask[i] < 0)
06127       continue;
06128     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
06129       // This entry crosses lanes, so there is no way to model this shuffle.
06130       return false;
06131 
06132     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
06133     if (RepeatedMask[i % LaneSize] == -1)
06134       // This is the first non-undef entry in this slot of a 128-bit lane.
06135       RepeatedMask[i % LaneSize] =
06136           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
06137     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
06138       // Found a mismatch with the repeated mask.
06139       return false;
06140   }
06141   return true;
06142 }
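
// Illustrative example (hypothetical masks): for v8i32 the mask
// {0, 8, 1, 9, 4, 12, 5, 13} is lane-repeated with RepeatedMask = {0, 8, 1, 9}
// (V2 entries are recorded with a +Size offset), whereas
// {0, 1, 2, 3, 7, 6, 5, 4} fails because the upper lane performs a different
// in-lane shuffle than the lower lane.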
06143 
06144 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
06145 /// arguments.
06146 ///
06147 /// This is a fast way to test a shuffle mask against a fixed pattern:
06148 ///
06149 ///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
06150 ///
06151 /// It returns true if the mask is exactly as wide as the expected mask, and
06152 /// each element of the mask is either -1 (signifying undef) or the value given
06153 /// in the expected mask.
06154 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
06155                                 ArrayRef<int> ExpectedMask) {
06156   if (Mask.size() != ExpectedMask.size())
06157     return false;
06158 
06159   int Size = Mask.size();
06160 
06161   // If the values are build vectors, we can look through them to find
06162   // equivalent inputs that make the shuffles equivalent.
06163   auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
06164   auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
06165 
06166   for (int i = 0; i < Size; ++i)
06167     if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) {
06168       auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
06169       auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
06170       if (!MaskBV || !ExpectedBV ||
06171           MaskBV->getOperand(Mask[i] % Size) !=
06172               ExpectedBV->getOperand(ExpectedMask[i] % Size))
06173         return false;
06174     }
06175 
06176   return true;
06177 }
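
// Illustrative example (hypothetical masks): isShuffleEquivalent(V1, V2,
// {3, -1, 1, 0}, {3, 2, 1, 0}) returns true because the undef entry matches
// anything. A differing pair such as Mask[i] = 0 vs. ExpectedMask[i] = 4 can
// still match when V1 and V2 are build vectors whose corresponding operands
// (operand 0 of V1 and operand 0 of V2) are the same SDValue, which is what
// the BuildVectorSDNode look-through above handles.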
06178 
06179 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
06180 ///
06181 /// This helper function produces an 8-bit shuffle immediate corresponding to
06182 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
06183 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
06184 /// example.
06185 ///
06186 /// NB: We rely heavily on "undef" masks preserving the input lane.
06187 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
06188                                           SelectionDAG &DAG) {
06189   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
06190   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
06191   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
06192   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
06193   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
06194 
06195   unsigned Imm = 0;
06196   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
06197   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
06198   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
06199   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
06200   return DAG.getConstant(Imm, MVT::i8);
06201 }
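
// Illustrative worked example (hypothetical mask): for Mask = {3, 1, -1, 0}
// the helper computes
//   Imm = (3 << 0) | (1 << 2) | (2 << 4) | (0 << 6) = 0x27,
// where the undef slot 2 falls back to its identity value 2, preserving that
// input lane as noted in the NB above.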
06202 
06203 /// \brief Try to emit a blend instruction for a shuffle using bit math.
06204 ///
06205 /// This is used as a fallback approach when first class blend instructions are
06206 /// unavailable. Currently it is only suitable for integer vectors, but could
06207 /// be generalized for floating point vectors if desirable.
06208 static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
06209                                             SDValue V2, ArrayRef<int> Mask,
06210                                             SelectionDAG &DAG) {
06211   assert(VT.isInteger() && "Only supports integer vector types!");
06212   MVT EltVT = VT.getScalarType();
06213   int NumEltBits = EltVT.getSizeInBits();
06214   SDValue Zero = DAG.getConstant(0, EltVT);
06215   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), EltVT);
06216   SmallVector<SDValue, 16> MaskOps;
06217   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06218     if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size)
06219       return SDValue(); // Shuffled input!
06220     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
06221   }
06222 
06223   SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps);
06224   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
06225   // We have to cast V2 around.
06226   MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
06227   V2 = DAG.getNode(ISD::BITCAST, DL, VT,
06228                    DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
06229                                DAG.getNode(ISD::BITCAST, DL, MaskVT, V1Mask),
06230                                DAG.getNode(ISD::BITCAST, DL, MaskVT, V2)));
06231   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
06232 }
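
// Illustrative example (hypothetical mask): a v4i32 shuffle with mask
// {0, 5, 2, 7} is a pure blend (every element stays in its slot), so V1Mask
// becomes {-1, 0, -1, 0} and the result is (V1 & V1Mask) | (V2 & ~V1Mask),
// with the ANDNP performed on a v2i64 bitcast of the operands. A mask such as
// {1, 5, 2, 7} is rejected because element 0 would also need to move.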
06233 
06234 /// \brief Try to emit a blend instruction for a shuffle.
06235 ///
06236 /// This doesn't do any checks for the availability of instructions for blending
06237 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
06238 /// be matched in the backend with the type given. What it does check for is
06239 /// that the shuffle mask is in fact a blend.
06240 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
06241                                          SDValue V2, ArrayRef<int> Mask,
06242                                          const X86Subtarget *Subtarget,
06243                                          SelectionDAG &DAG) {
06244   unsigned BlendMask = 0;
06245   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06246     if (Mask[i] >= Size) {
06247       if (Mask[i] != i + Size)
06248         return SDValue(); // Shuffled V2 input!
06249       BlendMask |= 1u << i;
06250       continue;
06251     }
06252     if (Mask[i] >= 0 && Mask[i] != i)
06253       return SDValue(); // Shuffled V1 input!
06254   }
06255   switch (VT.SimpleTy) {
06256   case MVT::v2f64:
06257   case MVT::v4f32:
06258   case MVT::v4f64:
06259   case MVT::v8f32:
06260     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
06261                        DAG.getConstant(BlendMask, MVT::i8));
06262 
06263   case MVT::v4i64:
06264   case MVT::v8i32:
06265     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
06266     // FALLTHROUGH
06267   case MVT::v2i64:
06268   case MVT::v4i32:
06269     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
06270     // that instruction.
06271     if (Subtarget->hasAVX2()) {
06272       // Scale the blend by the number of 32-bit dwords per element.
06273       int Scale =  VT.getScalarSizeInBits() / 32;
06274       BlendMask = 0;
06275       for (int i = 0, Size = Mask.size(); i < Size; ++i)
06276         if (Mask[i] >= Size)
06277           for (int j = 0; j < Scale; ++j)
06278             BlendMask |= 1u << (i * Scale + j);
06279 
06280       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
06281       V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
06282       V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
06283       return DAG.getNode(ISD::BITCAST, DL, VT,
06284                          DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
06285                                      DAG.getConstant(BlendMask, MVT::i8)));
06286     }
06287     // FALLTHROUGH
06288   case MVT::v8i16: {
06289     // For integer shuffles we need to expand the mask and cast the inputs to
06290     // v8i16s prior to blending.
06291     int Scale = 8 / VT.getVectorNumElements();
06292     BlendMask = 0;
06293     for (int i = 0, Size = Mask.size(); i < Size; ++i)
06294       if (Mask[i] >= Size)
06295         for (int j = 0; j < Scale; ++j)
06296           BlendMask |= 1u << (i * Scale + j);
06297 
06298     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
06299     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
06300     return DAG.getNode(ISD::BITCAST, DL, VT,
06301                        DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
06302                                    DAG.getConstant(BlendMask, MVT::i8)));
06303   }
06304 
06305   case MVT::v16i16: {
06306     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
06307     SmallVector<int, 8> RepeatedMask;
06308     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
06309       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
06310       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
06311       BlendMask = 0;
06312       for (int i = 0; i < 8; ++i)
06313         if (RepeatedMask[i] >= 16)
06314           BlendMask |= 1u << i;
06315       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
06316                          DAG.getConstant(BlendMask, MVT::i8));
06317     }
06318   }
06319     // FALLTHROUGH
06320   case MVT::v16i8:
06321   case MVT::v32i8: {
06322     assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) &&
06323            "256-bit byte-blends require AVX2 support!");
06324 
06325     // Scale the blend by the number of bytes per element.
06326     int Scale = VT.getScalarSizeInBits() / 8;
06327 
06328     // This form of blend is always done on bytes. Compute the byte vector
06329     // type.
06330     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
06331 
06332     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
06333     // mix of LLVM's code generator and the x86 backend. We tell the code
06334     // generator that boolean values in the elements of an x86 vector register
06335     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
06336     // mapping a select to operand #1, and 'false' mapping to operand #2. The
06337     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
06338     // of the element (the remaining are ignored) and 0 in that high bit would
06339     // mean operand #1 while 1 in the high bit would mean operand #2. So while
06340     // the LLVM model for boolean values in vector elements gets the relevant
06341     // bit set, it is set backwards and over constrained relative to x86's
06342     // actual model.
06343     SmallVector<SDValue, 32> VSELECTMask;
06344     for (int i = 0, Size = Mask.size(); i < Size; ++i)
06345       for (int j = 0; j < Scale; ++j)
06346         VSELECTMask.push_back(
06347             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
06348                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8));
06349 
06350     V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
06351     V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
06352     return DAG.getNode(
06353         ISD::BITCAST, DL, VT,
06354         DAG.getNode(ISD::VSELECT, DL, BlendVT,
06355                     DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask),
06356                     V1, V2));
06357   }
06358 
06359   default:
06360     llvm_unreachable("Not a supported integer vector type!");
06361   }
06362 }
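
// Illustrative example (hypothetical mask): a v2i64 shuffle with mask {0, 3}
// takes element 1 from V2 and is therefore a legal blend. With AVX2 the mask
// is rescaled to 32-bit dwords (Scale = 2), giving BlendMask = 0b1100, and a
// v4i32 BLENDI with immediate 0xC is emitted; without AVX2 the code falls
// through to the v8i16 path and emits a v8i16 BLENDI with immediate 0xF0.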
06363 
06364 /// \brief Try to lower as a blend of elements from two inputs followed by
06365 /// a single-input permutation.
06366 ///
06367 /// This matches the pattern where we can blend elements from two inputs and
06368 /// then reduce the shuffle to a single-input permutation.
06369 static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
06370                                                    SDValue V2,
06371                                                    ArrayRef<int> Mask,
06372                                                    SelectionDAG &DAG) {
06373   // We build up the blend mask while checking whether a blend is a viable way
06374   // to reduce the shuffle.
06375   SmallVector<int, 32> BlendMask(Mask.size(), -1);
06376   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
06377 
06378   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06379     if (Mask[i] < 0)
06380       continue;
06381 
06382     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
06383 
06384     if (BlendMask[Mask[i] % Size] == -1)
06385       BlendMask[Mask[i] % Size] = Mask[i];
06386     else if (BlendMask[Mask[i] % Size] != Mask[i])
06387       return SDValue(); // Can't blend in the needed input!
06388 
06389     PermuteMask[i] = Mask[i] % Size;
06390   }
06391 
06392   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
06393   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
06394 }
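
// Illustrative example (hypothetical mask): for a v4i32 shuffle with mask
// {6, 0, 3, 1}, the blend mask becomes {0, 1, 6, 3} (only element 2 is taken
// from V2, in its original slot) and the permute mask becomes {2, 0, 3, 1},
// which reorders the blended vector into the requested element order.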
06395 
06396 /// \brief Generic routine to decompose a shuffle and blend into independent
06397 /// blends and permutes.
06398 ///
06399 /// This matches the extremely common pattern for handling combined
06400 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
06401 /// operations. It will try to pick the best arrangement of shuffles and
06402 /// blends.
06403 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
06404                                                           SDValue V1,
06405                                                           SDValue V2,
06406                                                           ArrayRef<int> Mask,
06407                                                           SelectionDAG &DAG) {
06408   // Shuffle the input elements into the desired positions in V1 and V2 and
06409   // blend them together.
06410   SmallVector<int, 32> V1Mask(Mask.size(), -1);
06411   SmallVector<int, 32> V2Mask(Mask.size(), -1);
06412   SmallVector<int, 32> BlendMask(Mask.size(), -1);
06413   for (int i = 0, Size = Mask.size(); i < Size; ++i)
06414     if (Mask[i] >= 0 && Mask[i] < Size) {
06415       V1Mask[i] = Mask[i];
06416       BlendMask[i] = i;
06417     } else if (Mask[i] >= Size) {
06418       V2Mask[i] = Mask[i] - Size;
06419       BlendMask[i] = i + Size;
06420     }
06421 
06422   // Try to lower with the simpler initial blend strategy unless one of the
06423   // input shuffles would be a no-op. We prefer to shuffle inputs as the
06424   // shuffle may be able to fold with a load or other benefit. However, when
06425   // we'll have to do 2x as many shuffles in order to achieve this, blending
06426   // first is a better strategy.
06427   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
06428     if (SDValue BlendPerm =
06429             lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
06430       return BlendPerm;
06431 
06432   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
06433   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
06434   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
06435 }
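
// Illustrative example (hypothetical mask): for a v4i32 shuffle with mask
// {2, 6, 3, 7} the decomposition produces V1Mask = {2, -1, 3, -1},
// V2Mask = {-1, 2, -1, 3} and BlendMask = {0, 5, 2, 7}. Neither input shuffle
// is a no-op, so the blend-then-permute strategy above is attempted first.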
06436 
06437 /// \brief Try to lower a vector shuffle as a byte rotation.
06438 ///
06439 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
06440 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
06441 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
06442 /// try to generically lower a vector shuffle through such a pattern. It
06443 /// does not check for the profitability of lowering either as PALIGNR or
06444 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
06445 /// This matches shuffle vectors that look like:
06446 ///
06447 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
06448 ///
06449 /// Essentially it concatenates V1 and V2, shifts right by some number of
06450 /// elements, and takes the low elements as the result. Note that while this is
06451 /// specified as a *right shift* because x86 is little-endian, it is a *left
06452 /// rotate* of the vector lanes.
06453 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
06454                                               SDValue V2,
06455                                               ArrayRef<int> Mask,
06456                                               const X86Subtarget *Subtarget,
06457                                               SelectionDAG &DAG) {
06458   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
06459 
06460   int NumElts = Mask.size();
06461   int NumLanes = VT.getSizeInBits() / 128;
06462   int NumLaneElts = NumElts / NumLanes;
06463 
06464   // We need to detect various ways of spelling a rotation:
06465   //   [11, 12, 13, 14, 15,  0,  1,  2]
06466   //   [-1, 12, 13, 14, -1, -1,  1, -1]
06467   //   [-1, -1, -1, -1, -1, -1,  1,  2]
06468   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
06469   //   [-1,  4,  5,  6, -1, -1,  9, -1]
06470   //   [-1,  4,  5,  6, -1, -1, -1, -1]
06471   int Rotation = 0;
06472   SDValue Lo, Hi;
06473   for (int l = 0; l < NumElts; l += NumLaneElts) {
06474     for (int i = 0; i < NumLaneElts; ++i) {
06475       if (Mask[l + i] == -1)
06476         continue;
06477       assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
06478 
06480       // Get the mod-Size index and make it relative to this lane.
06480       int LaneIdx = (Mask[l + i] % NumElts) - l;
06481       // Make sure it was in this lane.
06482       if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
06483         return SDValue();
06484 
06485       // Determine where a rotated vector would have started.
06486       int StartIdx = i - LaneIdx;
06487       if (StartIdx == 0)
06488         // The identity rotation isn't interesting, stop.
06489         return SDValue();
06490 
06491       // If we found the tail of a vector, the rotation is the number of elements
06492       // missing from its front. If we found the head of a vector, the rotation is
06493       // the number of its head elements that are present in this lane.
06494       int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
06495 
06496       if (Rotation == 0)
06497         Rotation = CandidateRotation;
06498       else if (Rotation != CandidateRotation)
06499         // The rotations don't match, so we can't match this mask.
06500         return SDValue();
06501 
06502       // Compute which value this mask is pointing at.
06503       SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
06504 
06505       // Compute which of the two target values this index should be assigned
06506       // to. This reflects whether the high elements are remaining or the low
06507       // elements are remaining.
06508       SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
06509 
06510       // Either set up this value if we've not encountered it before, or check
06511       // that it remains consistent.
06512       if (!TargetV)
06513         TargetV = MaskV;
06514       else if (TargetV != MaskV)
06515         // This may be a rotation, but it pulls from the inputs in some
06516         // unsupported interleaving.
06517         return SDValue();
06518     }
06519   }
06520 
06521   // Check that we successfully analyzed the mask, and normalize the results.
06522   assert(Rotation != 0 && "Failed to locate a viable rotation!");
06523   assert((Lo || Hi) && "Failed to find a rotated input vector!");
06524   if (!Lo)
06525     Lo = Hi;
06526   else if (!Hi)
06527     Hi = Lo;
06528 
06529   // The actual rotate instruction rotates bytes, so we need to scale the
06530   // rotation based on how many bytes are in the vector lane.
06531   int Scale = 16 / NumLaneElts;
06532 
06533   // SSSE3 targets can use the palignr instruction.
06534   if (Subtarget->hasSSSE3()) {
06535     // Cast the inputs to i8 vector of correct length to match PALIGNR.
06536     MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
06537     Lo = DAG.getNode(ISD::BITCAST, DL, AlignVT, Lo);
06538     Hi = DAG.getNode(ISD::BITCAST, DL, AlignVT, Hi);
06539 
06540     return DAG.getNode(ISD::BITCAST, DL, VT,
06541                        DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
06542                                    DAG.getConstant(Rotation * Scale, MVT::i8)));
06543   }
06544 
06545   assert(VT.getSizeInBits() == 128 &&
06546          "Rotate-based lowering only supports 128-bit lowering!");
06547   assert(Mask.size() <= 16 &&
06548          "Can shuffle at most 16 bytes in a 128-bit vector!");
06549 
06550   // Default SSE2 implementation
06551   int LoByteShift = 16 - Rotation * Scale;
06552   int HiByteShift = Rotation * Scale;
06553 
06554   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
06555   Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
06556   Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
06557 
06558   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
06559                                 DAG.getConstant(LoByteShift, MVT::i8));
06560   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
06561                                 DAG.getConstant(HiByteShift, MVT::i8));
06562   return DAG.getNode(ISD::BITCAST, DL, VT,
06563                      DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
06564 }
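
// Illustrative worked example (hypothetical mask): for the v8i16 mask
// {11, 12, 13, 14, 15, 0, 1, 2} the loop finds Rotation = 3 with Lo = V1 and
// Hi = V2. With Scale = 2 bytes per element, the SSSE3 path emits a PALIGNR of
// Hi and Lo with a byte offset of 6, and the SSE2 fallback computes the same
// result as (Lo shifted left by 10 bytes) OR (Hi shifted right by 6 bytes).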
06565 
06566 /// \brief Compute whether each element of a shuffle is zeroable.
06567 ///
06568 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
06569 /// Either it is an undef element in the shuffle mask, the element of the input
06570 /// referenced is undef, or the element of the input referenced is known to be
06571 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
06572 /// as many lanes with this technique as possible to simplify the remaining
06573 /// shuffle.
06574 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
06575                                                      SDValue V1, SDValue V2) {
06576   SmallBitVector Zeroable(Mask.size(), false);
06577 
06578   while (V1.getOpcode() == ISD::BITCAST)
06579     V1 = V1->getOperand(0);
06580   while (V2.getOpcode() == ISD::BITCAST)
06581     V2 = V2->getOperand(0);
06582 
06583   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
06584   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
06585 
06586   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06587     int M = Mask[i];
06588     // Handle the easy cases.
06589     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
06590       Zeroable[i] = true;
06591       continue;
06592     }
06593 
06594     // If this is an index into a build_vector node (which has the same number
06595     // of elements), dig out the input value and use it.
06596     SDValue V = M < Size ? V1 : V2;
06597     if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
06598       continue;
06599 
06600     SDValue Input = V.getOperand(M % Size);
06601     // The UNDEF opcode check really should be dead code here, but not quite
06602     // worth asserting on (it isn't invalid, just unexpected).
06603     if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
06604       Zeroable[i] = true;
06605   }
06606 
06607   return Zeroable;
06608 }
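
// Illustrative example (hypothetical operands): if V2 is an all-zeros build
// vector and the v4i32 mask is {0, 4, 1, 5}, elements 1 and 3 are zeroable
// because they read from V2, while elements 0 and 2 are only zeroable if the
// corresponding V1 build_vector operands are undef or known-zero nodes.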
06609 
06610 /// \brief Try to emit a bitmask instruction for a shuffle.
06611 ///
06612 /// This handles cases where we can model a blend exactly as a bitmask due to
06613 /// one of the inputs being zeroable.
06614 static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
06615                                            SDValue V2, ArrayRef<int> Mask,
06616                                            SelectionDAG &DAG) {
06617   MVT EltVT = VT.getScalarType();
06618   int NumEltBits = EltVT.getSizeInBits();
06619   MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
06620   SDValue Zero = DAG.getConstant(0, IntEltVT);
06621   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
06622   if (EltVT.isFloatingPoint()) {
06623     Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
06624     AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
06625   }
06626   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
06627   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
06628   SDValue V;
06629   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06630     if (Zeroable[i])
06631       continue;
06632     if (Mask[i] % Size != i)
06633       return SDValue(); // Not a blend.
06634     if (!V)
06635       V = Mask[i] < Size ? V1 : V2;
06636     else if (V != (Mask[i] < Size ? V1 : V2))
06637       return SDValue(); // Can only let one input through the mask.
06638 
06639     VMaskOps[i] = AllOnes;
06640   }
06641   if (!V)
06642     return SDValue(); // No non-zeroable elements!
06643 
06644   SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
06645   V = DAG.getNode(VT.isFloatingPoint()
06646                   ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
06647                   DL, VT, V, VMask);
06648   return V;
06649 }
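
// Illustrative example (hypothetical operands): if V2 is an all-zeros vector
// and the v4i32 mask is {4, 1, 6, 3}, elements 0 and 2 are zeroable and the
// remaining elements take V1 in place, so VMask becomes {0, -1, 0, -1} and the
// shuffle lowers to a single AND of V1 with that mask (FAND for floating-point
// types).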
06650 
06651 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
06652 ///
06653 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
06654 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
06655 /// matches elements from one of the input vectors shuffled to the left or
06656 /// right with zeroable elements 'shifted in'. It handles both the strictly
06657 /// bit-wise element shifts and the byte shift across an entire 128-bit double
06658 /// quad word lane.
06659 ///
06660 /// PSLL : (little-endian) left bit shift.
06661 /// [ zz, 0, zz,  2 ]
06662 /// [ -1, 4, zz, -1 ]
06663 /// PSRL : (little-endian) right bit shift.
06664 /// [  1, zz,  3, zz]
06665 /// [ -1, -1,  7, zz]
06666 /// PSLLDQ : (little-endian) left byte shift
06667 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
06668 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
06669 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
06670 /// PSRLDQ : (little-endian) right byte shift
06671 /// [  5, 6,  7, zz, zz, zz, zz, zz]
06672 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
06673 /// [  1, 2, -1, -1, -1, -1, zz, zz]
06674 static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
06675                                          SDValue V2, ArrayRef<int> Mask,
06676                                          SelectionDAG &DAG) {
06677   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
06678 
06679   int Size = Mask.size();
06680   assert(Size