00001 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file defines the interfaces that X86 uses to lower LLVM code into a
00011 // selection DAG.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "X86ISelLowering.h"
00016 #include "Utils/X86ShuffleDecode.h"
00017 #include "X86CallingConv.h"
00018 #include "X86FrameLowering.h"
00019 #include "X86InstrBuilder.h"
00020 #include "X86MachineFunctionInfo.h"
00021 #include "X86TargetMachine.h"
00022 #include "X86TargetObjectFile.h"
00023 #include "llvm/ADT/SmallBitVector.h"
00024 #include "llvm/ADT/SmallSet.h"
00025 #include "llvm/ADT/Statistic.h"
00026 #include "llvm/ADT/StringExtras.h"
00027 #include "llvm/ADT/StringSwitch.h"
00028 #include "llvm/CodeGen/IntrinsicLowering.h"
00029 #include "llvm/CodeGen/MachineFrameInfo.h"
00030 #include "llvm/CodeGen/MachineFunction.h"
00031 #include "llvm/CodeGen/MachineInstrBuilder.h"
00032 #include "llvm/CodeGen/MachineJumpTableInfo.h"
00033 #include "llvm/CodeGen/MachineModuleInfo.h"
00034 #include "llvm/CodeGen/MachineRegisterInfo.h"
00035 #include "llvm/IR/CallSite.h"
00036 #include "llvm/IR/CallingConv.h"
00037 #include "llvm/IR/Constants.h"
00038 #include "llvm/IR/DerivedTypes.h"
00039 #include "llvm/IR/Function.h"
00040 #include "llvm/IR/GlobalAlias.h"
00041 #include "llvm/IR/GlobalVariable.h"
00042 #include "llvm/IR/Instructions.h"
00043 #include "llvm/IR/Intrinsics.h"
00044 #include "llvm/MC/MCAsmInfo.h"
00045 #include "llvm/MC/MCContext.h"
00046 #include "llvm/MC/MCExpr.h"
00047 #include "llvm/MC/MCSymbol.h"
00048 #include "llvm/Support/CommandLine.h"
00049 #include "llvm/Support/Debug.h"
00050 #include "llvm/Support/ErrorHandling.h"
00051 #include "llvm/Support/MathExtras.h"
00052 #include "llvm/Target/TargetOptions.h"
00053 #include "X86IntrinsicsInfo.h"
00054 #include <bitset>
00055 #include <numeric>
00056 #include <cctype>
00057 using namespace llvm;
00058 
00059 #define DEBUG_TYPE "x86-isel"
00060 
00061 STATISTIC(NumTailCalls, "Number of tail calls");
00062 
00063 static cl::opt<bool> ExperimentalVectorWideningLegalization(
00064     "x86-experimental-vector-widening-legalization", cl::init(false),
00065     cl::desc("Enable an experimental vector type legalization through widening "
00066              "rather than promotion."),
00067     cl::Hidden);
00068 
00069 static cl::opt<int> ReciprocalEstimateRefinementSteps(
00070     "x86-recip-refinement-steps", cl::init(1),
00071     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
00072              "result of the hardware reciprocal estimate instruction."),
00073     cl::NotHidden);
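// Each Newton-Raphson step roughly doubles the number of accurate bits: the
// hardware RCPPS/RSQRTPS estimates are good to about 12 bits, so the default of
// one refinement step gives roughly single-precision accuracy (illustrative
// reciprocal iteration: x1 = x0 * (2 - a * x0)).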
00074 
00075 // Forward declarations.
00076 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
00077                        SDValue V2);
00078 
00079 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
00080                                      const X86Subtarget &STI)
00081     : TargetLowering(TM), Subtarget(&STI) {
00082   X86ScalarSSEf64 = Subtarget->hasSSE2();
00083   X86ScalarSSEf32 = Subtarget->hasSSE1();
00084   TD = getDataLayout();
00085 
00086   // Set up the TargetLowering object.
00087   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
00088 
00089   // X86 is weird. It always uses i8 for shift amounts and setcc results.
00090   setBooleanContents(ZeroOrOneBooleanContent);
00091   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
00092   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
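  // E.g. a scalar SETCC materializes 0 or 1 in an 8-bit register via SETcc,
  // while vector compares such as PCMPEQD or CMPPS produce all-ones/all-zeros
  // lanes that are used directly as masks.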
00093 
00094   // For 64-bit, since we have so many registers, use the ILP scheduler.
00095   // For 32-bit, use the register pressure specific scheduling.
00096   // For Atom, always use ILP scheduling.
00097   if (Subtarget->isAtom())
00098     setSchedulingPreference(Sched::ILP);
00099   else if (Subtarget->is64Bit())
00100     setSchedulingPreference(Sched::ILP);
00101   else
00102     setSchedulingPreference(Sched::RegPressure);
00103   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
00104   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
00105 
00106   // Bypass expensive divides on Atom when compiling with O2.
00107   if (TM.getOptLevel() >= CodeGenOpt::Default) {
00108     if (Subtarget->hasSlowDivide32())
00109       addBypassSlowDiv(32, 8);
00110     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
00111       addBypassSlowDiv(64, 16);
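    // Roughly, addBypassSlowDiv(32, 8) has CodeGenPrepare guard the wide divide
    // with a runtime check and use the much cheaper narrow divide whenever both
    // operands happen to fit in the narrow width.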
00112   }
00113 
00114   if (Subtarget->isTargetKnownWindowsMSVC()) {
00115     // Setup Windows compiler runtime calls.
00116     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
00117     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
00118     setLibcallName(RTLIB::SREM_I64, "_allrem");
00119     setLibcallName(RTLIB::UREM_I64, "_aullrem");
00120     setLibcallName(RTLIB::MUL_I64, "_allmul");
00121     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
00122     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
00123     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
00124     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
00125     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
00126 
00127     // The _ftol2 runtime function has an unusual calling conv, which
00128     // is modeled by a special pseudo-instruction.
00129     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
00130     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
00131     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
00132     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
00133   }
00134 
00135   if (Subtarget->isTargetDarwin()) {
00136     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
00137     setUseUnderscoreSetJmp(false);
00138     setUseUnderscoreLongJmp(false);
00139   } else if (Subtarget->isTargetWindowsGNU()) {
00140     // The MS runtime is odd: it provides _setjmp, but plain longjmp (no underscore).
00141     setUseUnderscoreSetJmp(true);
00142     setUseUnderscoreLongJmp(false);
00143   } else {
00144     setUseUnderscoreSetJmp(true);
00145     setUseUnderscoreLongJmp(true);
00146   }
00147 
00148   // Set up the register classes.
00149   addRegisterClass(MVT::i8, &X86::GR8RegClass);
00150   addRegisterClass(MVT::i16, &X86::GR16RegClass);
00151   addRegisterClass(MVT::i32, &X86::GR32RegClass);
00152   if (Subtarget->is64Bit())
00153     addRegisterClass(MVT::i64, &X86::GR64RegClass);
00154 
00155   for (MVT VT : MVT::integer_valuetypes())
00156     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
00157 
00158   // We don't accept any truncstore of integer registers.
00159   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
00160   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
00161   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
00162   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
00163   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
00164   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
00165 
00166   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
00167 
00168   // SETOEQ and SETUNE require checking two conditions.
00169   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
00170   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
00171   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
00172   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
00173   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
00174   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
00175 
00176   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
00177   // operation.
00178   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
00179   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
00180   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
00181 
00182   if (Subtarget->is64Bit()) {
00183     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
00184     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00185   } else if (!TM.Options.UseSoftFloat) {
00186     // We have an algorithm for SSE2->double, and we turn this into a
00187     // 64-bit FILD followed by conditional FADD for other targets.
00188     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00189     // We have an algorithm for SSE2, and we turn this into a 64-bit
00190     // FILD for other targets.
00191     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
00192   }
00193 
00194   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
00195   // this operation.
00196   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
00197   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
00198 
00199   if (!TM.Options.UseSoftFloat) {
00200     // SSE has no i16 to fp conversion, only i32
00201     if (X86ScalarSSEf32) {
00202       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00203       // f32 and f64 cases are Legal, f80 case is not
00204       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00205     } else {
00206       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
00207       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00208     }
00209   } else {
00210     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00211     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
00212   }
00213 
00214   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
00215   // are Legal, f80 is custom lowered.
00216   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
00217   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
00218 
00219   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
00220   // this operation.
00221   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
00222   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
00223 
00224   if (X86ScalarSSEf32) {
00225     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
00226     // f32 and f64 cases are Legal, f80 case is not
00227     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00228   } else {
00229     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
00230     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00231   }
00232 
00233   // Handle FP_TO_UINT by promoting the destination to a larger signed
00234   // conversion.
00235   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
00236   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
00237   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
00238 
00239   if (Subtarget->is64Bit()) {
00240     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
00241     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
00242   } else if (!TM.Options.UseSoftFloat) {
00243     // Since AVX is a superset of SSE3, only check for SSE here.
00244     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
00245       // Expand FP_TO_UINT into a select.
00246       // FIXME: We would like to use a Custom expander here eventually to do
00247       // the optimal thing for SSE vs. the default expansion in the legalizer.
00248       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
00249     else
00250       // With SSE3 we can use fisttpll to convert to a signed i64; without
00251       // SSE, we're stuck with a fistpll.
00252       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
00253   }
00254 
00255   if (isTargetFTOL()) {
00256     // Use the _ftol2 runtime function, which has a pseudo-instruction
00257     // to handle its weird calling convention.
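    // (_ftol2 takes its argument on the x87 stack and returns the converted
    // value in EDX:EAX, which is not expressible as an ordinary libcall.)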
00258     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
00259   }
00260 
00261   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
00262   if (!X86ScalarSSEf64) {
00263     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
00264     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
00265     if (Subtarget->is64Bit()) {
00266       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
00267       // Without SSE, i64->f64 goes through memory.
00268       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
00269     }
00270   }
00271 
00272   // Scalar integer divide and remainder are lowered to use operations that
00273   // produce two results, to match the available instructions. This exposes
00274   // the two-result form to trivial CSE, which is able to combine x/y and x%y
00275   // into a single instruction.
00276   //
00277   // Scalar integer multiply-high is also lowered to use two-result
00278   // operations, to match the available instructions. However, plain multiply
00279   // (low) operations are left as Legal, as there are single-result
00280   // instructions for this in x86. Using the two-result multiply instructions
00281   // when both high and low results are needed must be arranged by dagcombine.
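  // For example, "x / y" and "x % y" in the same block both legalize to an
  // ISD::SDIVREM node; CSE merges them, and a single IDIV then yields the
  // quotient in EAX and the remainder in EDX.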
00282   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00283     MVT VT = IntVTs[i];
00284     setOperationAction(ISD::MULHS, VT, Expand);
00285     setOperationAction(ISD::MULHU, VT, Expand);
00286     setOperationAction(ISD::SDIV, VT, Expand);
00287     setOperationAction(ISD::UDIV, VT, Expand);
00288     setOperationAction(ISD::SREM, VT, Expand);
00289     setOperationAction(ISD::UREM, VT, Expand);
00290 
00291     // Add/Sub overflow ops with MVT::Glue are lowered to EFLAGS dependences.
00292     setOperationAction(ISD::ADDC, VT, Custom);
00293     setOperationAction(ISD::ADDE, VT, Custom);
00294     setOperationAction(ISD::SUBC, VT, Custom);
00295     setOperationAction(ISD::SUBE, VT, Custom);
00296   }
00297 
00298   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
00299   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
00300   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
00301   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
00302   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
00303   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
00304   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
00305   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
00306   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
00307   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
00308   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
00309   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
00310   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
00311   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
00312   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
00313   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
00314   if (Subtarget->is64Bit())
00315     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
00316   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
00317   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
00318   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
00319   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
00320   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
00321   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
00322   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
00323   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
00324 
00325   // Promote the i8 variants and force them on up to i32 which has a shorter
00326   // encoding.
00327   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
00328   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
00329   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
00330   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
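  // With BMI, TZCNT is defined for a zero input, so plain CTTZ stays legal and
  // CTTZ_ZERO_UNDEF is simply expanded back into it; without BMI the custom
  // lowering has to work around BSF's undefined result for a zero input.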
00331   if (Subtarget->hasBMI()) {
00332     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
00333     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
00334     if (Subtarget->is64Bit())
00335       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
00336   } else {
00337     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
00338     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
00339     if (Subtarget->is64Bit())
00340       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
00341   }
00342 
00343   if (Subtarget->hasLZCNT()) {
00344     // When promoting the i8 variants, force them to i32 for a shorter
00345     // encoding.
00346     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
00347     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
00348     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
00349     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
00350     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
00351     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
00352     if (Subtarget->is64Bit())
00353       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
00354   } else {
00355     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
00356     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
00357     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
00358     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
00359     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
00360     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
00361     if (Subtarget->is64Bit()) {
00362       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
00363       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
00364     }
00365   }
00366 
00367   // Special handling for half-precision floating point conversions.
00368   // If we don't have F16C support, then lower half float conversions
00369   // into library calls.
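  // (F16C provides VCVTPH2PS/VCVTPS2PH for f16<->f32 conversions; the Expand
  // action below is what produces those runtime library calls when F16C is
  // unavailable.)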
00370   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
00371     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
00372     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
00373   }
00374 
00375   // There's never any support for operations beyond MVT::f32.
00376   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
00377   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
00378   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
00379   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
00380 
00381   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
00382   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
00383   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
00384   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
00385   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
00386   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
00387 
00388   if (Subtarget->hasPOPCNT()) {
00389     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
00390   } else {
00391     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
00392     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
00393     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
00394     if (Subtarget->is64Bit())
00395       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
00396   }
00397 
00398   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
00399 
00400   if (!Subtarget->hasMOVBE())
00401     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
00402 
00403   // These should be promoted to a larger select which is supported.
00404   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
00405   // X86 wants to expand cmov itself.
00406   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
00407   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
00408   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
00409   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
00410   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
00411   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
00412   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
00413   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
00414   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
00415   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
00416   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
00417   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
00418   if (Subtarget->is64Bit()) {
00419     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
00420     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
00421   }
00422   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
00423   // NOTE: EH_SJLJ_SETJMP/_LONGJMP are NOT intended to support SjLj exception
00424   // handling; they are a light-weight setjmp/longjmp replacement used for
00425   // continuations, user-level threading, and the like. As a result, no other
00426   // SjLj exception interfaces are implemented, so please don't build your own
00427   // exception handling on top of them.
00428   // LLVM/Clang supports zero-cost DWARF exception handling.
00429   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
00430   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
00431 
00432   // Darwin ABI issue.
00433   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
00434   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
00435   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
00436   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
00437   if (Subtarget->is64Bit())
00438     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
00439   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
00440   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
00441   if (Subtarget->is64Bit()) {
00442     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
00443     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
00444     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
00445     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
00446     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
00447   }
00448   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
00449   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
00450   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
00451   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
00452   if (Subtarget->is64Bit()) {
00453     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
00454     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
00455     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
00456   }
00457 
00458   if (Subtarget->hasSSE1())
00459     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
00460 
00461   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
00462 
00463   // Expand certain atomics
00464   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00465     MVT VT = IntVTs[i];
00466     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
00467     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
00468     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
00469   }
00470 
00471   if (Subtarget->hasCmpxchg16b()) {
00472     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
00473   }
00474 
00475   // FIXME - use subtarget debug flags
00476   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
00477       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
00478     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
00479   }
00480 
00481   if (Subtarget->is64Bit()) {
00482     setExceptionPointerRegister(X86::RAX);
00483     setExceptionSelectorRegister(X86::RDX);
00484   } else {
00485     setExceptionPointerRegister(X86::EAX);
00486     setExceptionSelectorRegister(X86::EDX);
00487   }
00488   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
00489   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
00490 
00491   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
00492   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
00493 
00494   setOperationAction(ISD::TRAP, MVT::Other, Legal);
00495   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
00496 
00497   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
00498   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
00499   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
00500   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
00501     // TargetInfo::X86_64ABIBuiltinVaList
00502     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
00503     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
00504   } else {
00505     // TargetInfo::CharPtrBuiltinVaList
00506     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
00507     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
00508   }
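  // On x86-64 SysV targets va_list is a structure (gp_offset, fp_offset,
  // overflow_arg_area, reg_save_area), so VAARG/VACOPY need custom lowering;
  // on 32-bit and Win64 it is just a char pointer, so the default expansion
  // suffices.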
00509 
00510   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
00511   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
00512 
00513   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
00514 
00515   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
00516     // f32 and f64 use SSE.
00517     // Set up the FP register classes.
00518     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00519     addRegisterClass(MVT::f64, &X86::FR64RegClass);
00520 
00521     // Use ANDPD to simulate FABS.
00522     setOperationAction(ISD::FABS , MVT::f64, Custom);
00523     setOperationAction(ISD::FABS , MVT::f32, Custom);
00524 
00525     // Use XORP to simulate FNEG.
00526     setOperationAction(ISD::FNEG , MVT::f64, Custom);
00527     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00528 
00529     // Use ANDPD and ORPD to simulate FCOPYSIGN.
00530     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
00531     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
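    // E.g. FABS clears the sign bit by ANDing with a constant whose sign bit is
    // zero, FNEG flips it by XORing with a sign-bit-only constant, and
    // FCOPYSIGN combines the magnitude bits of one operand with the sign bit of
    // the other using AND and OR.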
00532 
00533     // Lower this to FGETSIGNx86 plus an AND.
00534     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
00535     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
00536 
00537     // We don't support sin/cos/fmod
00538     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00539     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00540     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00541     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00542     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00543     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00544 
00545     // Expand FP immediates into loads from the stack, except for the special
00546     // cases we handle.
00547     addLegalFPImmediate(APFloat(+0.0)); // xorpd
00548     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00549   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
00550     // Use SSE for f32, x87 for f64.
00551     // Set up the FP register classes.
00552     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00553     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00554 
00555     // Use ANDPS to simulate FABS.
00556     setOperationAction(ISD::FABS , MVT::f32, Custom);
00557 
00558     // Use XORP to simulate FNEG.
00559     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00560 
00561     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00562 
00563     // Use ANDPS and ORPS to simulate FCOPYSIGN.
00564     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00565     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00566 
00567     // We don't support sin/cos/fmod
00568     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00569     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00570     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00571 
00572     // Special cases we handle for FP constants.
00573     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00574     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00575     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00576     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00577     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00578 
00579     if (!TM.Options.UnsafeFPMath) {
00580       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00581       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00582       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00583     }
00584   } else if (!TM.Options.UseSoftFloat) {
00585     // f32 and f64 in x87.
00586     // Set up the FP register classes.
00587     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00588     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
00589 
00590     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00591     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
00592     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00593     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
00594 
00595     if (!TM.Options.UnsafeFPMath) {
00596       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00597       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00598       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00599       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00600       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00601       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00602     }
00603     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00604     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00605     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00606     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00607     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
00608     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
00609     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
00610     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
00611   }
00612 
00613   // We don't support FMA.
00614   setOperationAction(ISD::FMA, MVT::f64, Expand);
00615   setOperationAction(ISD::FMA, MVT::f32, Expand);
00616 
00617   // Long double always uses X87.
00618   if (!TM.Options.UseSoftFloat) {
00619     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
00620     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
00621     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
00622     {
00623       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
00624       addLegalFPImmediate(TmpFlt);  // FLD0
00625       TmpFlt.changeSign();
00626       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
00627 
00628       bool ignored;
00629       APFloat TmpFlt2(+1.0);
00630       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
00631                       &ignored);
00632       addLegalFPImmediate(TmpFlt2);  // FLD1
00633       TmpFlt2.changeSign();
00634       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
00635     }
00636 
00637     if (!TM.Options.UnsafeFPMath) {
00638       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
00639       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
00640       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
00641     }
00642 
00643     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
00644     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
00645     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
00646     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
00647     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
00648     setOperationAction(ISD::FMA, MVT::f80, Expand);
00649   }
00650 
00651   // Always use a library call for pow.
00652   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
00653   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
00654   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
00655 
00656   setOperationAction(ISD::FLOG, MVT::f80, Expand);
00657   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
00658   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
00659   setOperationAction(ISD::FEXP, MVT::f80, Expand);
00660   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
00661   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
00662   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
00663 
00664   // First set operation action for all vector types to either promote
00665   // (for widening) or expand (for scalarization). Then we will selectively
00666   // turn on ones that can be effectively codegen'd.
00667   for (MVT VT : MVT::vector_valuetypes()) {
00668     setOperationAction(ISD::ADD , VT, Expand);
00669     setOperationAction(ISD::SUB , VT, Expand);
00670     setOperationAction(ISD::FADD, VT, Expand);
00671     setOperationAction(ISD::FNEG, VT, Expand);
00672     setOperationAction(ISD::FSUB, VT, Expand);
00673     setOperationAction(ISD::MUL , VT, Expand);
00674     setOperationAction(ISD::FMUL, VT, Expand);
00675     setOperationAction(ISD::SDIV, VT, Expand);
00676     setOperationAction(ISD::UDIV, VT, Expand);
00677     setOperationAction(ISD::FDIV, VT, Expand);
00678     setOperationAction(ISD::SREM, VT, Expand);
00679     setOperationAction(ISD::UREM, VT, Expand);
00680     setOperationAction(ISD::LOAD, VT, Expand);
00681     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
00682     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
00683     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
00684     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
00685     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
00686     setOperationAction(ISD::FABS, VT, Expand);
00687     setOperationAction(ISD::FSIN, VT, Expand);
00688     setOperationAction(ISD::FSINCOS, VT, Expand);
00689     setOperationAction(ISD::FCOS, VT, Expand);
00690     setOperationAction(ISD::FSINCOS, VT, Expand);
00691     setOperationAction(ISD::FREM, VT, Expand);
00692     setOperationAction(ISD::FMA,  VT, Expand);
00693     setOperationAction(ISD::FPOWI, VT, Expand);
00694     setOperationAction(ISD::FSQRT, VT, Expand);
00695     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
00696     setOperationAction(ISD::FFLOOR, VT, Expand);
00697     setOperationAction(ISD::FCEIL, VT, Expand);
00698     setOperationAction(ISD::FTRUNC, VT, Expand);
00699     setOperationAction(ISD::FRINT, VT, Expand);
00700     setOperationAction(ISD::FNEARBYINT, VT, Expand);
00701     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
00702     setOperationAction(ISD::MULHS, VT, Expand);
00703     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
00704     setOperationAction(ISD::MULHU, VT, Expand);
00705     setOperationAction(ISD::SDIVREM, VT, Expand);
00706     setOperationAction(ISD::UDIVREM, VT, Expand);
00707     setOperationAction(ISD::FPOW, VT, Expand);
00708     setOperationAction(ISD::CTPOP, VT, Expand);
00709     setOperationAction(ISD::CTTZ, VT, Expand);
00710     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
00711     setOperationAction(ISD::CTLZ, VT, Expand);
00712     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
00713     setOperationAction(ISD::SHL, VT, Expand);
00714     setOperationAction(ISD::SRA, VT, Expand);
00715     setOperationAction(ISD::SRL, VT, Expand);
00716     setOperationAction(ISD::ROTL, VT, Expand);
00717     setOperationAction(ISD::ROTR, VT, Expand);
00718     setOperationAction(ISD::BSWAP, VT, Expand);
00719     setOperationAction(ISD::SETCC, VT, Expand);
00720     setOperationAction(ISD::FLOG, VT, Expand);
00721     setOperationAction(ISD::FLOG2, VT, Expand);
00722     setOperationAction(ISD::FLOG10, VT, Expand);
00723     setOperationAction(ISD::FEXP, VT, Expand);
00724     setOperationAction(ISD::FEXP2, VT, Expand);
00725     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
00726     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
00727     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
00728     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
00729     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
00730     setOperationAction(ISD::TRUNCATE, VT, Expand);
00731     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
00732     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
00733     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
00734     setOperationAction(ISD::VSELECT, VT, Expand);
00735     setOperationAction(ISD::SELECT_CC, VT, Expand);
00736     for (MVT InnerVT : MVT::vector_valuetypes()) {
00737       setTruncStoreAction(InnerVT, VT, Expand);
00738 
00739       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
00740       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
00741 
00742       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
00743       // types, we have to deal with them whether we ask for Expansion or not.
00744       // Setting Expand causes its own optimisation problems though, so leave
00745       // them legal.
00746       if (VT.getVectorElementType() == MVT::i1)
00747         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
00748     }
00749   }
00750 
00751   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
00752   // with -msoft-float, disable use of MMX as well.
00753   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
00754     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
00755     // No operations on x86mmx are supported; everything uses intrinsics.
00756   }
00757 
00758   // MMX-sized vectors (other than x86mmx) are expected to be expanded
00759   // into smaller operations.
00760   for (MVT MMXTy : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v1i64}) {
00761     setOperationAction(ISD::MULHS,              MMXTy,      Expand);
00762     setOperationAction(ISD::AND,                MMXTy,      Expand);
00763     setOperationAction(ISD::OR,                 MMXTy,      Expand);
00764     setOperationAction(ISD::XOR,                MMXTy,      Expand);
00765     setOperationAction(ISD::SCALAR_TO_VECTOR,   MMXTy,      Expand);
00766     setOperationAction(ISD::SELECT,             MMXTy,      Expand);
00767     setOperationAction(ISD::BITCAST,            MMXTy,      Expand);
00768   }
00769   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
00770 
00771   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
00772     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
00773 
00774     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
00775     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
00776     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
00777     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
00778     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
00779     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
00780     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
00781     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
00782     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
00783     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
00784     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
00785     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00786     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
00787     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
00788   }
00789 
00790   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
00791     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
00792 
00793     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
00794     // registers cannot be used even for integer operations.
00795     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
00796     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
00797     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
00798     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
00799 
00800     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
00801     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
00802     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
00803     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
00804     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
00805     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
00806     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
00807     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
00808     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
00809     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
00810     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
00811     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
00812     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
00813     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
00814     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
00815     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
00816     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
00817     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
00818     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
00819     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
00820     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
00821     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
00822 
00823     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
00824     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
00825     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
00826     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
00827 
00828     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
00829     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
00830     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
00831     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
00832     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
00833 
00834     // Only provide customized ctpop vector bit twiddling for vector types we
00835     // know to perform better than using the popcnt instructions on each vector
00836     // element. If popcnt isn't supported, always provide the custom version.
00837     if (!Subtarget->hasPOPCNT()) {
00838       setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
00839       setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
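      // The custom lowering counts bits with in-vector shift/mask/add steps
      // rather than extracting each element for a scalar POPCNT.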
00840     }
00841 
00842     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
00843     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
00844       MVT VT = (MVT::SimpleValueType)i;
00845       // Do not attempt to custom lower non-power-of-2 vectors
00846       if (!isPowerOf2_32(VT.getVectorNumElements()))
00847         continue;
00848       // Do not attempt to custom lower non-128-bit vectors
00849       if (!VT.is128BitVector())
00850         continue;
00851       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
00852       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
00853       setOperationAction(ISD::VSELECT,            VT, Custom);
00854       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
00855     }
00856 
00857     // We support custom legalizing of sext and anyext loads for specific
00858     // memory vector types which we can load as a scalar (or sequence of
00859     // scalars) and extend in-register to a legal 128-bit vector type. For sext
00860     // loads these must work with a single scalar load.
00861     for (MVT VT : MVT::integer_vector_valuetypes()) {
00862       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
00863       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
00864       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
00865       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
00866       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
00867       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
00868       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
00869       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
00870       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
00871     }
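    // E.g. a sextload from v4i8 to v4i32 can be a single 32-bit scalar load
    // followed by an in-register sign extension (with SSE4.1 this is a single
    // PMOVSXBD; see the SSE41 block below).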
00872 
00873     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
00874     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
00875     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
00876     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
00877     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
00878     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
00879     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
00880     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
00881 
00882     if (Subtarget->is64Bit()) {
00883       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
00884       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
00885     }
00886 
00887     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
00888     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
00889       MVT VT = (MVT::SimpleValueType)i;
00890 
00891       // Do not attempt to promote non-128-bit vectors
00892       if (!VT.is128BitVector())
00893         continue;
00894 
00895       setOperationAction(ISD::AND,    VT, Promote);
00896       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
00897       setOperationAction(ISD::OR,     VT, Promote);
00898       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
00899       setOperationAction(ISD::XOR,    VT, Promote);
00900       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
00901       setOperationAction(ISD::LOAD,   VT, Promote);
00902       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
00903       setOperationAction(ISD::SELECT, VT, Promote);
00904       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
00905     }
00906 
00907     // Custom lower v2i64 and v2f64 selects.
00908     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
00909     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
00910     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
00911     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
00912 
00913     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
00914     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
00915 
00916     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
00917     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
00918     // As there is no 64-bit GPR available, we need to build a special custom
00919     // sequence to convert from v2i32 to v2f32.
00920     if (!Subtarget->is64Bit())
00921       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
00922 
00923     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
00924     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
00925 
00926     for (MVT VT : MVT::fp_vector_valuetypes())
00927       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
00928 
00929     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
00930     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
00931     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
00932   }
00933 
00934   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
00935     for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
00936       setOperationAction(ISD::FFLOOR,           RoundedTy,  Legal);
00937       setOperationAction(ISD::FCEIL,            RoundedTy,  Legal);
00938       setOperationAction(ISD::FTRUNC,           RoundedTy,  Legal);
00939       setOperationAction(ISD::FRINT,            RoundedTy,  Legal);
00940       setOperationAction(ISD::FNEARBYINT,       RoundedTy,  Legal);
00941     }
00942 
00943     // FIXME: Do we need to handle scalar-to-vector here?
00944     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
00945 
00946     // We directly match byte blends in the backend as they match the VSELECT
00947     // condition form.
00948     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
00949 
00950     // SSE41 brings specific instructions for doing vector sign extend even in
00951     // cases where we don't have SRA.
00952     for (MVT VT : MVT::integer_vector_valuetypes()) {
00953       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
00954       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
00955       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
00956     }
00957 
00958     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
00959     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
00960     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
00961     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
00962     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
00963     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
00964     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
00965 
00966     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
00967     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
00968     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
00969     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
00970     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
00971     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
00972 
00973     // i8 and i16 vectors are custom because the source register and source
00974     // memory operand types are not the same width.  f32 vectors are
00975     // custom since the immediate controlling the insert encodes additional
00976     // information.
00977     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
00978     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
00979     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
00980     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
00981 
00982     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
00983     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
00984     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
00985     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00986 
00987     // FIXME: these should be Legal, but that's only for the case where
00988     // the index is constant.  For now custom expand to deal with that.
00989     if (Subtarget->is64Bit()) {
00990       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
00991       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
00992     }
00993   }
00994 
00995   if (Subtarget->hasSSE2()) {
00996     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
00997     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
00998 
00999     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
01000     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
01001 
01002     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
01003     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
01004 
01005     // In the custom shift lowering, cases that are already legal with AVX2 are
01006     // recognized and left as-is.
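    // With AVX2, VPSLLVD/Q, VPSRLVD/Q and VPSRAVD provide per-element variable
    // shifts for 32/64-bit elements (there is no 64-bit arithmetic variant).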
01007     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
01008     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
01009 
01010     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
01011     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
01012 
01013     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
01014   }
01015 
01016   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
01017     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
01018     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
01019     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
01020     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
01021     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
01022     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
01023 
01024     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
01025     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
01026     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
01027 
01028     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
01029     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
01030     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
01031     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
01032     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
01033     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
01034     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
01035     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
01036     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
01037     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
01038     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
01039     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
01040 
01041     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
01042     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
01043     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
01044     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
01045     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
01046     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
01047     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
01048     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
01049     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
01050     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
01051     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
01052     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
01053 
01054     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
01055     // even though v8i16 is a legal type.
01056     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
01057     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
01058     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
01059 
01060     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
01061     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
01062     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
01063 
01064     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
01065     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
01066 
01067     for (MVT VT : MVT::fp_vector_valuetypes())
01068       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
01069 
01070     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
01071     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
01072 
01073     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
01074     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
01075 
01076     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
01077     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
01078 
01079     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
01080     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
01081     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
01082     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
01083 
01084     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
01085     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
01086     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
01087 
01088     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
01089     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
01090     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
01091     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
01092     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
01093     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
01094     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
01095     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
01096     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
01097     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
01098     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
01099     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
01100 
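    // With either FMA3 or FMA4 available, the generic ISD::FMA node can map
    // straight to a fused multiply-add instruction, so it is Legal for the
    // vector and scalar FP types listed below.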
01101     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
01102       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
01103       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
01104       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
01105       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
01106       setOperationAction(ISD::FMA,             MVT::f32, Legal);
01107       setOperationAction(ISD::FMA,             MVT::f64, Legal);
01108     }
01109 
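    // 256-bit integer arithmetic is only natively supported with AVX2
    // (hasInt256); without it these operations are Custom-lowered, typically
    // by splitting the 256-bit vector into two 128-bit halves.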
01110     if (Subtarget->hasInt256()) {
01111       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
01112       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
01113       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
01114       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
01115 
01116       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
01117       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
01118       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
01119       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
01120 
01121       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01122       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
01123       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
01124       // Don't lower v32i8 because there is no 128-bit byte mul
01125 
01126       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
01127       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
01128       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
01129       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
01130 
01131       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
01132       // when we have a 256-bit wide blend with immediate.
01133       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
01134 
01135       // Only provide customized ctpop vector bit twiddling for vector types we
01136       // know will perform better than using the popcnt instructions on each
01137       // vector element. If popcnt isn't supported, always provide the custom
01138       // version.
01139       if (!Subtarget->hasPOPCNT())
01140         setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
01141 
01142       // Custom CTPOP always performs better on natively supported v8i32
01143       setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
01144 
01145       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
01146       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
01147       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
01148       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
01149       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
01150       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
01151       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
01152 
01153       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
01154       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
01155       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
01156       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
01157       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
01158       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
01159     } else {
01160       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
01161       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
01162       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
01163       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
01164 
01165       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
01166       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
01167       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
01168       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
01169 
01170       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01171       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
01172       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
01173       // Don't lower v32i8 because there is no 128-bit byte mul
01174     }
01175 
01176     // In the customized shift lowering, the legal cases in AVX2 will be
01177     // recognized.
01178     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
01179     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
01180 
01181     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
01182     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
01183 
01184     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
01185 
01186     // Custom lower several nodes for 256-bit types.
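    // Note that masked loads/stores (MLOAD/MSTORE) are only marked Legal for
    // element types of at least 32 bits, matching the dword/qword granularity
    // of the VMASKMOV-family instructions.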
01187     for (MVT VT : MVT::vector_valuetypes()) {
01188       if (VT.getScalarSizeInBits() >= 32) {
01189         setOperationAction(ISD::MLOAD,  VT, Legal);
01190         setOperationAction(ISD::MSTORE, VT, Legal);
01191       }
01192       // Extract subvector is special because the value type
01193       // (result) is 128-bit but the source is 256-bit wide.
01194       if (VT.is128BitVector()) {
01195         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01196       }
01197       // Do not attempt to custom lower other non-256-bit vectors
01198       if (!VT.is256BitVector())
01199         continue;
01200 
01201       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01202       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01203       setOperationAction(ISD::VSELECT,            VT, Custom);
01204       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
01205       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01206       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
01207       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
01208       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
01209     }
01210 
01211     if (Subtarget->hasInt256())
01212       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
01213 
01214 
01215     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
01216     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
01217       MVT VT = (MVT::SimpleValueType)i;
01218 
01219       // Do not attempt to promote non-256-bit vectors
01220       if (!VT.is256BitVector())
01221         continue;
01222 
01223       setOperationAction(ISD::AND,    VT, Promote);
01224       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
01225       setOperationAction(ISD::OR,     VT, Promote);
01226       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
01227       setOperationAction(ISD::XOR,    VT, Promote);
01228       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
01229       setOperationAction(ISD::LOAD,   VT, Promote);
01230       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
01231       setOperationAction(ISD::SELECT, VT, Promote);
01232       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
01233     }
01234   }
01235 
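  // AVX-512: 512-bit vectors live in the ZMM (VR512) register class, and the
  // k0-k7 mask registers are modeled as scalar i1 plus the v8i1/v16i1 vector
  // types registered below.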
01236   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
01237     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
01238     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
01239     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
01240     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
01241 
01242     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
01243     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
01244     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
01245 
01246     for (MVT VT : MVT::fp_vector_valuetypes())
01247       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
01248 
01249     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
01250     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
01251     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
01252     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
01253     setOperationAction(ISD::AND,                MVT::i1,    Legal);
01254     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
01255     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
01256     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
01257     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
01258     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
01259 
01260     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
01261     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
01262     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
01263     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
01264     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
01265     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
01266 
01267     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
01268     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
01269     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
01270     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
01271     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
01272     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
01273     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
01274     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
01275 
01276     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
01277     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
01278     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
01279     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
01280     if (Subtarget->is64Bit()) {
01281       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
01282       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
01283       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
01284       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
01285     }
01286     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
01287     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
01288     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
01289     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
01290     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
01291     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
01292     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
01293     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
01294     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
01295     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
01296     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
01297     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
01298     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
01299     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
01300 
01301     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
01302     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
01303     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
01304     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
01305     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
01306     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
01307     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
01308     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
01309     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
01310     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
01311     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
01312     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
01313     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
01314 
01315     setOperationAction(ISD::FFLOOR,             MVT::v16f32, Legal);
01316     setOperationAction(ISD::FFLOOR,             MVT::v8f64, Legal);
01317     setOperationAction(ISD::FCEIL,              MVT::v16f32, Legal);
01318     setOperationAction(ISD::FCEIL,              MVT::v8f64, Legal);
01319     setOperationAction(ISD::FTRUNC,             MVT::v16f32, Legal);
01320     setOperationAction(ISD::FTRUNC,             MVT::v8f64, Legal);
01321     setOperationAction(ISD::FRINT,              MVT::v16f32, Legal);
01322     setOperationAction(ISD::FRINT,              MVT::v8f64, Legal);
01323     setOperationAction(ISD::FNEARBYINT,         MVT::v16f32, Legal);
01324     setOperationAction(ISD::FNEARBYINT,         MVT::v8f64, Legal);
01325 
01326     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
01327     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
01328     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
01329     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
01330     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
01331 
01332     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
01333     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
01334 
01335     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
01336 
01337     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
01338     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
01339     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
01340     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
01341     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
01342     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
01343     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
01344     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
01345     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
01346 
01347     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
01348     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
01349 
01350     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
01351     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
01352 
01353     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
01354 
01355     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
01356     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
01357 
01358     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
01359     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
01360 
01361     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
01362     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
01363 
01364     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
01365     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
01366     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
01367     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
01368     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
01369     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
01370 
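    // AVX-512CD provides vplzcntd/vplzcntq, so vector count-leading-zeros is
    // natively supported.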
01371     if (Subtarget->hasCDI()) {
01372       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
01373       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
01374     }
01375 
01376     // Custom lower several nodes.
01377     for (MVT VT : MVT::vector_valuetypes()) {
01378       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01379       // Extract subvector is special because the value type
01380       // (result) is 256/128-bit but the source is 512-bit wide.
01381       if (VT.is128BitVector() || VT.is256BitVector()) {
01382         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01383       }
01384       if (VT.getVectorElementType() == MVT::i1)
01385         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
01386 
01387       // Do not attempt to custom lower other non-512-bit vectors
01388       if (!VT.is512BitVector())
01389         continue;
01390 
01391       if (EltSize >= 32) {
01392         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
01393         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
01394         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01395         setOperationAction(ISD::VSELECT,             VT, Legal);
01396         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
01397         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
01398         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
01399         setOperationAction(ISD::MLOAD,               VT, Legal);
01400         setOperationAction(ISD::MSTORE,              VT, Legal);
01401       }
01402     }
01403     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01404       MVT VT = (MVT::SimpleValueType)i;
01405 
01406       // Do not attempt to promote non-512-bit vectors.
01407       if (!VT.is512BitVector())
01408         continue;
01409 
01410       setOperationAction(ISD::SELECT, VT, Promote);
01411       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
01412     }
01413   } // has AVX-512
01414 
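  // AVX-512BW adds native byte/word element support for 512-bit vectors,
  // along with the v32i1/v64i1 mask register types.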
01415   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
01416     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
01417     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
01418 
01419     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
01420     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
01421 
01422     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
01423     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
01424     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
01425     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
01426     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
01427     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
01428     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
01429     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
01430     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
01431     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v32i1, Custom);
01432     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i1, Custom);
01433     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
01434     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
01435 
01436     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01437       const MVT VT = (MVT::SimpleValueType)i;
01438 
01439       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01440 
01441       // Do not attempt to promote non-512-bit vectors.
01442       if (!VT.is512BitVector())
01443         continue;
01444 
01445       if (EltSize < 32) {
01446         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01447         setOperationAction(ISD::VSELECT,             VT, Legal);
01448       }
01449     }
01450   }
01451 
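  // AVX-512VL extends the mask-register model down to 128/256-bit vectors,
  // which is where the narrow v2i1/v4i1 mask types come from.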
01452   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
01453     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
01454     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
01455 
01456     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
01457     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
01458     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v4i1, Custom);
01459     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1, Custom);
01460     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Custom);
01461     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v4i1, Custom);
01462 
01463     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
01464     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
01465     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
01466     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
01467     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
01468     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
01469   }
01470 
01471   // We want to custom lower some of our intrinsics.
01472   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
01473   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
01474   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
01475   if (!Subtarget->is64Bit())
01476     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
01477 
01478   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
01479   // handle type legalization for these operations here.
01480   //
01481   // FIXME: We really should do custom legalization for addition and
01482   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
01483   // than generic legalization for 64-bit multiplication-with-overflow, though.
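  // For example, @llvm.sadd.with.overflow.i32 is lowered to an X86 add that
  // also produces EFLAGS, with the overflow bit recovered via a SETO-style
  // setcc on that flags result.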
01484   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
01485     // Add/Sub/Mul with overflow operations are custom lowered.
01486     MVT VT = IntVTs[i];
01487     setOperationAction(ISD::SADDO, VT, Custom);
01488     setOperationAction(ISD::UADDO, VT, Custom);
01489     setOperationAction(ISD::SSUBO, VT, Custom);
01490     setOperationAction(ISD::USUBO, VT, Custom);
01491     setOperationAction(ISD::SMULO, VT, Custom);
01492     setOperationAction(ISD::UMULO, VT, Custom);
01493   }
01494 
01495 
01496   if (!Subtarget->is64Bit()) {
01497     // These libcalls are not available in 32-bit mode.
01498     setLibcallName(RTLIB::SHL_I128, nullptr);
01499     setLibcallName(RTLIB::SRL_I128, nullptr);
01500     setLibcallName(RTLIB::SRA_I128, nullptr);
01501   }
01502 
01503   // Combine sin / cos into one node or libcall if possible.
01504   if (Subtarget->hasSinCos()) {
01505     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
01506     setLibcallName(RTLIB::SINCOS_F64, "sincos");
01507     if (Subtarget->isTargetDarwin()) {
01508       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
01509       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
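      // For example, a sinf(x)/cosf(x) pair on the same operand can then be
      // folded into a single __sincos_stret call.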
01510       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
01511       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
01512     }
01513   }
01514 
01515   if (Subtarget->isTargetWin64()) {
01516     setOperationAction(ISD::SDIV, MVT::i128, Custom);
01517     setOperationAction(ISD::UDIV, MVT::i128, Custom);
01518     setOperationAction(ISD::SREM, MVT::i128, Custom);
01519     setOperationAction(ISD::UREM, MVT::i128, Custom);
01520     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
01521     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
01522   }
01523 
01524   // We have target-specific dag combine patterns for the following nodes:
01525   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
01526   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
01527   setTargetDAGCombine(ISD::BITCAST);
01528   setTargetDAGCombine(ISD::VSELECT);
01529   setTargetDAGCombine(ISD::SELECT);
01530   setTargetDAGCombine(ISD::SHL);
01531   setTargetDAGCombine(ISD::SRA);
01532   setTargetDAGCombine(ISD::SRL);
01533   setTargetDAGCombine(ISD::OR);
01534   setTargetDAGCombine(ISD::AND);
01535   setTargetDAGCombine(ISD::ADD);
01536   setTargetDAGCombine(ISD::FADD);
01537   setTargetDAGCombine(ISD::FSUB);
01538   setTargetDAGCombine(ISD::FMA);
01539   setTargetDAGCombine(ISD::SUB);
01540   setTargetDAGCombine(ISD::LOAD);
01541   setTargetDAGCombine(ISD::MLOAD);
01542   setTargetDAGCombine(ISD::STORE);
01543   setTargetDAGCombine(ISD::MSTORE);
01544   setTargetDAGCombine(ISD::ZERO_EXTEND);
01545   setTargetDAGCombine(ISD::ANY_EXTEND);
01546   setTargetDAGCombine(ISD::SIGN_EXTEND);
01547   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
01548   setTargetDAGCombine(ISD::TRUNCATE);
01549   setTargetDAGCombine(ISD::SINT_TO_FP);
01550   setTargetDAGCombine(ISD::SETCC);
01551   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
01552   setTargetDAGCombine(ISD::BUILD_VECTOR);
01553   setTargetDAGCombine(ISD::MUL);
01554   setTargetDAGCombine(ISD::XOR);
01555 
01556   computeRegisterProperties(Subtarget->getRegisterInfo());
01557 
01558   // On Darwin, -Os means optimize for size without hurting performance,
01559   // so do not reduce the limit.
01560   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
01561   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
01562   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
01563   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01564   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
01565   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01566   setPrefLoopAlignment(4); // 2^4 bytes.
01567 
01568   // Predictable cmovs don't hurt on Atom because it's in-order.
01569   PredictableSelectIsExpensive = !Subtarget->isAtom();
01570   EnableExtLdPromotion = true;
01571   setPrefFunctionAlignment(4); // 2^4 bytes.
01572 
01573   verifyIntrinsicTables();
01574 }
01575 
01576 // This has so far only been implemented for 64-bit MachO.
01577 bool X86TargetLowering::useLoadStackGuardNode() const {
01578   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
01579 }
01580 
01581 TargetLoweringBase::LegalizeTypeAction
01582 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
01583   if (ExperimentalVectorWideningLegalization &&
01584       VT.getVectorNumElements() != 1 &&
01585       VT.getVectorElementType().getSimpleVT() != MVT::i1)
01586     return TypeWidenVector;
01587 
01588   return TargetLoweringBase::getPreferredVectorAction(VT);
01589 }
01590 
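// For vector compares, AVX-512 subtargets return the corresponding k-register
// mask type (v8i1, v16i1, ...) when one exists for the element width;
// otherwise the result is a vector of the same width with integer elements.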
01591 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
01592   if (!VT.isVector())
01593     return Subtarget->hasAVX512() ? MVT::i1 : MVT::i8;
01594 
01595   const unsigned NumElts = VT.getVectorNumElements();
01596   const EVT EltVT = VT.getVectorElementType();
01597   if (VT.is512BitVector()) {
01598     if (Subtarget->hasAVX512())
01599       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01600           EltVT == MVT::f32 || EltVT == MVT::f64)
01601         switch (NumElts) {
01602         case  8: return MVT::v8i1;
01603         case 16: return MVT::v16i1;
01604         }
01605     if (Subtarget->hasBWI())
01606       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01607         switch (NumElts) {
01608         case 32: return MVT::v32i1;
01609         case 64: return MVT::v64i1;
01610         }
01611   }
01612 
01613   if (VT.is256BitVector() || VT.is128BitVector()) {
01614     if (Subtarget->hasVLX())
01615       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01616           EltVT == MVT::f32 || EltVT == MVT::f64)
01617         switch (NumElts) {
01618         case 2: return MVT::v2i1;
01619         case 4: return MVT::v4i1;
01620         case 8: return MVT::v8i1;
01621         }
01622     if (Subtarget->hasBWI() && Subtarget->hasVLX())
01623       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01624         switch (NumElts) {
01625         case  8: return MVT::v8i1;
01626         case 16: return MVT::v16i1;
01627         case 32: return MVT::v32i1;
01628         }
01629   }
01630 
01631   return VT.changeVectorElementTypeToInteger();
01632 }
01633 
01634 /// Helper for getByValTypeAlignment to determine
01635 /// the desired ByVal argument alignment.
01636 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
01637   if (MaxAlign == 16)
01638     return;
01639   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
01640     if (VTy->getBitWidth() == 128)
01641       MaxAlign = 16;
01642   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
01643     unsigned EltAlign = 0;
01644     getMaxByValAlign(ATy->getElementType(), EltAlign);
01645     if (EltAlign > MaxAlign)
01646       MaxAlign = EltAlign;
01647   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
01648     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
01649       unsigned EltAlign = 0;
01650       getMaxByValAlign(STy->getElementType(i), EltAlign);
01651       if (EltAlign > MaxAlign)
01652         MaxAlign = EltAlign;
01653       if (MaxAlign == 16)
01654         break;
01655     }
01656   }
01657 }
01658 
01659 /// Return the desired alignment for ByVal aggregate
01660 /// function arguments in the caller parameter area. For X86, aggregates
01661 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
01662 /// are at 4-byte boundaries.
01663 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
01664   if (Subtarget->is64Bit()) {
01665     // Max of 8 and alignment of type.
01666     unsigned TyAlign = TD->getABITypeAlignment(Ty);
01667     if (TyAlign > 8)
01668       return TyAlign;
01669     return 8;
01670   }
01671 
01672   unsigned Align = 4;
01673   if (Subtarget->hasSSE1())
01674     getMaxByValAlign(Ty, Align);
01675   return Align;
01676 }
01677 
01678 /// Returns the target specific optimal type for load
01679 /// and store operations as a result of memset, memcpy, and memmove
01680 /// lowering. If DstAlign is zero, it is safe to assume the destination
01681 /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero,
01682 /// there is no need to check it against an alignment requirement,
01683 /// probably because the source does not need to be loaded. If 'IsMemset' is
01684 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
01685 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
01686 /// source is constant so it does not need to be loaded.
01687 /// It returns EVT::Other if the type should be determined using generic
01688 /// target-independent logic.
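/// For example, a 32-byte copy on an AVX2 target with fast unaligned accesses
/// is lowered with v8i32, while a small copy on 32-bit x86 with SSE2 may use
/// f64 loads/stores to halve the number of memory operations.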
01689 EVT
01690 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
01691                                        unsigned DstAlign, unsigned SrcAlign,
01692                                        bool IsMemset, bool ZeroMemset,
01693                                        bool MemcpyStrSrc,
01694                                        MachineFunction &MF) const {
01695   const Function *F = MF.getFunction();
01696   if ((!IsMemset || ZeroMemset) &&
01697       !F->hasFnAttribute(Attribute::NoImplicitFloat)) {
01698     if (Size >= 16 &&
01699         (Subtarget->isUnalignedMemAccessFast() ||
01700          ((DstAlign == 0 || DstAlign >= 16) &&
01701           (SrcAlign == 0 || SrcAlign >= 16)))) {
01702       if (Size >= 32) {
01703         if (Subtarget->hasInt256())
01704           return MVT::v8i32;
01705         if (Subtarget->hasFp256())
01706           return MVT::v8f32;
01707       }
01708       if (Subtarget->hasSSE2())
01709         return MVT::v4i32;
01710       if (Subtarget->hasSSE1())
01711         return MVT::v4f32;
01712     } else if (!MemcpyStrSrc && Size >= 8 &&
01713                !Subtarget->is64Bit() &&
01714                Subtarget->hasSSE2()) {
01715       // Do not use f64 to lower memcpy if the source is a string constant. It's
01716       // better to use i32 to avoid the loads.
01717       return MVT::f64;
01718     }
01719   }
01720   if (Subtarget->is64Bit() && Size >= 8)
01721     return MVT::i64;
01722   return MVT::i32;
01723 }
01724 
01725 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
01726   if (VT == MVT::f32)
01727     return X86ScalarSSEf32;
01728   else if (VT == MVT::f64)
01729     return X86ScalarSSEf64;
01730   return true;
01731 }
01732 
01733 bool
01734 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
01735                                                   unsigned,
01736                                                   unsigned,
01737                                                   bool *Fast) const {
01738   if (Fast)
01739     *Fast = Subtarget->isUnalignedMemAccessFast();
01740   return true;
01741 }
01742 
01743 /// Return the entry encoding for a jump table in the
01744 /// current function.  The returned value is a member of the
01745 /// MachineJumpTableInfo::JTEntryKind enum.
01746 unsigned X86TargetLowering::getJumpTableEncoding() const {
01747   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
01748   // symbol.
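  // (Each 32-bit entry then looks roughly like ".long .LBB0_n@GOTOFF", i.e. it
  // is resolved relative to the GOT base rather than as an absolute address.)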
01749   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01750       Subtarget->isPICStyleGOT())
01751     return MachineJumpTableInfo::EK_Custom32;
01752 
01753   // Otherwise, use the normal jump table encoding heuristics.
01754   return TargetLowering::getJumpTableEncoding();
01755 }
01756 
01757 const MCExpr *
01758 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
01759                                              const MachineBasicBlock *MBB,
01760                                              unsigned uid, MCContext &Ctx) const {
01761   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
01762          Subtarget->isPICStyleGOT());
01763   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
01764   // entries.
01765   return MCSymbolRefExpr::Create(MBB->getSymbol(),
01766                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
01767 }
01768 
01769 /// Returns relocation base for the given PIC jumptable.
01770 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
01771                                                     SelectionDAG &DAG) const {
01772   if (!Subtarget->is64Bit())
01773     // This doesn't have SDLoc associated with it, but is not really the
01774     // same as a Register.
01775     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
01776   return Table;
01777 }
01778 
01779 /// This returns the relocation base for the given PIC jumptable,
01780 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
01781 const MCExpr *X86TargetLowering::
01782 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
01783                              MCContext &Ctx) const {
01784   // X86-64 uses RIP relative addressing based on the jump table label.
01785   if (Subtarget->isPICStyleRIPRel())
01786     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
01787 
01788   // Otherwise, the reference is relative to the PIC base.
01789   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
01790 }
01791 
01792 std::pair<const TargetRegisterClass *, uint8_t>
01793 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
01794                                            MVT VT) const {
01795   const TargetRegisterClass *RRC = nullptr;
01796   uint8_t Cost = 1;
01797   switch (VT.SimpleTy) {
01798   default:
01799     return TargetLowering::findRepresentativeClass(TRI, VT);
01800   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
01801     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
01802     break;
01803   case MVT::x86mmx:
01804     RRC = &X86::VR64RegClass;
01805     break;
01806   case MVT::f32: case MVT::f64:
01807   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
01808   case MVT::v4f32: case MVT::v2f64:
01809   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
01810   case MVT::v4f64:
01811     RRC = &X86::VR128RegClass;
01812     break;
01813   }
01814   return std::make_pair(RRC, Cost);
01815 }
01816 
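// Address space 256 selects a %gs-relative access and 257 an %fs-relative one,
// so the pairs below encode %fs:0x28 (or %gs:0x28 for the kernel code model)
// on x86-64 and %gs:0x14 on i386.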
01817 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
01818                                                unsigned &Offset) const {
01819   if (!Subtarget->isTargetLinux())
01820     return false;
01821 
01822   if (Subtarget->is64Bit()) {
01823     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:0x28.
01824     Offset = 0x28;
01825     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
01826       AddressSpace = 256;
01827     else
01828       AddressSpace = 257;
01829   } else {
01830     // %gs:0x14 on i386
01831     Offset = 0x14;
01832     AddressSpace = 256;
01833   }
01834   return true;
01835 }
01836 
01837 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
01838                                             unsigned DestAS) const {
01839   assert(SrcAS != DestAS && "Expected different address spaces!");
01840 
01841   return SrcAS < 256 && DestAS < 256;
01842 }
01843 
01844 //===----------------------------------------------------------------------===//
01845 //               Return Value Calling Convention Implementation
01846 //===----------------------------------------------------------------------===//
01847 
01848 #include "X86GenCallingConv.inc"
01849 
01850 bool
01851 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
01852                                   MachineFunction &MF, bool isVarArg,
01853                         const SmallVectorImpl<ISD::OutputArg> &Outs,
01854                         LLVMContext &Context) const {
01855   SmallVector<CCValAssign, 16> RVLocs;
01856   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
01857   return CCInfo.CheckReturn(Outs, RetCC_X86);
01858 }
01859 
01860 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
01861   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
01862   return ScratchRegs;
01863 }
01864 
01865 SDValue
01866 X86TargetLowering::LowerReturn(SDValue Chain,
01867                                CallingConv::ID CallConv, bool isVarArg,
01868                                const SmallVectorImpl<ISD::OutputArg> &Outs,
01869                                const SmallVectorImpl<SDValue> &OutVals,
01870                                SDLoc dl, SelectionDAG &DAG) const {
01871   MachineFunction &MF = DAG.getMachineFunction();
01872   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
01873 
01874   SmallVector<CCValAssign, 16> RVLocs;
01875   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
01876   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
01877 
01878   SDValue Flag;
01879   SmallVector<SDValue, 6> RetOps;
01880   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
01881   // Operand #1 = Bytes To Pop
01882   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
01883                    MVT::i16));
01884 
01885   // Copy the result values into the output registers.
01886   for (unsigned i = 0; i != RVLocs.size(); ++i) {
01887     CCValAssign &VA = RVLocs[i];
01888     assert(VA.isRegLoc() && "Can only return in registers!");
01889     SDValue ValToCopy = OutVals[i];
01890     EVT ValVT = ValToCopy.getValueType();
01891 
01892     // Promote values to the appropriate types.
01893     if (VA.getLocInfo() == CCValAssign::SExt)
01894       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
01895     else if (VA.getLocInfo() == CCValAssign::ZExt)
01896       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
01897     else if (VA.getLocInfo() == CCValAssign::AExt)
01898       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
01899     else if (VA.getLocInfo() == CCValAssign::BCvt)
01900       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
01901 
01902     assert(VA.getLocInfo() != CCValAssign::FPExt &&
01903            "Unexpected FP-extend for return value.");
01904 
01905     // If this is x86-64, and we disabled SSE, we can't return FP values,
01906     // or SSE or MMX vectors.
01907     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
01908          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
01909           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
01910       report_fatal_error("SSE register return with SSE disabled");
01911     }
01912     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
01913     // llvm-gcc has never done it right and no one has noticed, so this
01914     // should be OK for now.
01915     if (ValVT == MVT::f64 &&
01916         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
01917       report_fatal_error("SSE2 register return with SSE2 disabled");
01918 
01919     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
01920     // the RET instruction and handled by the FP Stackifier.
01921     if (VA.getLocReg() == X86::FP0 ||
01922         VA.getLocReg() == X86::FP1) {
01923       // If this is a copy from an xmm register to ST(0), use an FPExtend to
01924       // change the value to the FP stack register class.
01925       if (isScalarFPTypeInSSEReg(VA.getValVT()))
01926         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
01927       RetOps.push_back(ValToCopy);
01928       // Don't emit a copytoreg.
01929       continue;
01930     }
01931 
01932     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
01933     // which is returned in RAX / RDX.
01934     if (Subtarget->is64Bit()) {
01935       if (ValVT == MVT::x86mmx) {
01936         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
01937           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
01938           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
01939                                   ValToCopy);
01940           // If we don't have SSE2 available, convert to v4f32 so the generated
01941           // register is legal.
01942           if (!Subtarget->hasSSE2())
01943             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, ValToCopy);
01944         }
01945       }
01946     }
01947 
01948     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
01949     Flag = Chain.getValue(1);
01950     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
01951   }
01952 
01953   // The x86-64 ABIs require that for returning structs by value we copy
01954   // the sret argument into %rax/%eax (depending on ABI) for the return.
01955   // Win32 requires us to put the sret argument to %eax as well.
01956   // We saved the argument into a virtual register in the entry block,
01957   // so now we copy the value out and into %rax/%eax.
01958   //
01959   // Checking Function.hasStructRetAttr() here is insufficient because the IR
01960   // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
01961   // false, then an sret argument may be implicitly inserted in the SelDAG. In
01962   // either case FuncInfo->setSRetReturnReg() will have been called.
01963   if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
01964     assert((Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) &&
01965            "No need for an sret register");
01966     SDValue Val = DAG.getCopyFromReg(Chain, dl, SRetReg, getPointerTy());
01967 
01968     unsigned RetValReg
01969         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
01970           X86::RAX : X86::EAX;
01971     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
01972     Flag = Chain.getValue(1);
01973 
01974     // RAX/EAX now acts like a return value.
01975     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
01976   }
01977 
01978   RetOps[0] = Chain;  // Update chain.
01979 
01980   // Add the flag if we have it.
01981   if (Flag.getNode())
01982     RetOps.push_back(Flag);
01983 
01984   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
01985 }
01986 
01987 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
01988   if (N->getNumValues() != 1)
01989     return false;
01990   if (!N->hasNUsesOfValue(1, 0))
01991     return false;
01992 
01993   SDValue TCChain = Chain;
01994   SDNode *Copy = *N->use_begin();
01995   if (Copy->getOpcode() == ISD::CopyToReg) {
01996     // If the copy has a glue operand, we conservatively assume it isn't safe to
01997     // perform a tail call.
01998     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
01999       return false;
02000     TCChain = Copy->getOperand(0);
02001   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
02002     return false;
02003 
02004   bool HasRet = false;
02005   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
02006        UI != UE; ++UI) {
02007     if (UI->getOpcode() != X86ISD::RET_FLAG)
02008       return false;
02009     // If we are returning more than one value, we can definitely
02010     // not make a tail call; see PR19530.
02011     if (UI->getNumOperands() > 4)
02012       return false;
02013     if (UI->getNumOperands() == 4 &&
02014         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
02015       return false;
02016     HasRet = true;
02017   }
02018 
02019   if (!HasRet)
02020     return false;
02021 
02022   Chain = TCChain;
02023   return true;
02024 }
02025 
02026 EVT
02027 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
02028                                             ISD::NodeType ExtendKind) const {
02029   MVT ReturnMVT;
02030   // TODO: Is this also valid on 32-bit?
02031   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
02032     ReturnMVT = MVT::i8;
02033   else
02034     ReturnMVT = MVT::i32;
02035 
02036   EVT MinVT = getRegisterType(Context, ReturnMVT);
02037   return VT.bitsLT(MinVT) ? MinVT : VT;
02038 }
02039 
02040 /// Lower the result values of a call into the
02041 /// appropriate copies out of appropriate physical registers.
02042 ///
02043 SDValue
02044 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
02045                                    CallingConv::ID CallConv, bool isVarArg,
02046                                    const SmallVectorImpl<ISD::InputArg> &Ins,
02047                                    SDLoc dl, SelectionDAG &DAG,
02048                                    SmallVectorImpl<SDValue> &InVals) const {
02049 
02050   // Assign locations to each value returned by this call.
02051   SmallVector<CCValAssign, 16> RVLocs;
02052   bool Is64Bit = Subtarget->is64Bit();
02053   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
02054                  *DAG.getContext());
02055   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
02056 
02057   // Copy all of the result registers out of their specified physreg.
02058   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
02059     CCValAssign &VA = RVLocs[i];
02060     EVT CopyVT = VA.getValVT();
02061 
02062     // If this is x86-64, and we disabled SSE, we can't return FP values
02063     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
02064         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
02065       report_fatal_error("SSE register return with SSE disabled");
02066     }
02067 
02068     // If we prefer to use the value in xmm registers, copy it out as f80 and
02069     // use a truncate to move it from fp stack reg to xmm reg.
02070     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
02071         isScalarFPTypeInSSEReg(VA.getValVT()))
02072       CopyVT = MVT::f80;
02073 
02074     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
02075                                CopyVT, InFlag).getValue(1);
02076     SDValue Val = Chain.getValue(0);
02077 
02078     if (CopyVT != VA.getValVT())
02079       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
02080                         // This truncation won't change the value.
02081                         DAG.getIntPtrConstant(1));
02082 
02083     InFlag = Chain.getValue(2);
02084     InVals.push_back(Val);
02085   }
02086 
02087   return Chain;
02088 }
02089 
02090 //===----------------------------------------------------------------------===//
02091 //                C & StdCall & Fast Calling Convention implementation
02092 //===----------------------------------------------------------------------===//
02093 //  The StdCall calling convention is standard for many Windows API
02094 //  routines. It differs from the C calling convention only slightly: the
02095 //  callee cleans up the stack rather than the caller, and symbols are also
02096 //  decorated in some fancy way :) It doesn't support any vector arguments.
02097 //  For info on fast calling convention see Fast Calling Convention (tail call)
02098 //  implementation LowerX86_32FastCCCallTo.
02099 
02100 /// CallIsStructReturn - Determines whether a call uses struct return
02101 /// semantics.
02102 enum StructReturnType {
02103   NotStructReturn,
02104   RegStructReturn,
02105   StackStructReturn
02106 };
02107 static StructReturnType
02108 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
02109   if (Outs.empty())
02110     return NotStructReturn;
02111 
02112   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
02113   if (!Flags.isSRet())
02114     return NotStructReturn;
02115   if (Flags.isInReg())
02116     return RegStructReturn;
02117   return StackStructReturn;
02118 }
02119 
02120 /// Determines whether a function uses struct return semantics.
02121 static StructReturnType
02122 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
02123   if (Ins.empty())
02124     return NotStructReturn;
02125 
02126   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
02127   if (!Flags.isSRet())
02128     return NotStructReturn;
02129   if (Flags.isInReg())
02130     return RegStructReturn;
02131   return StackStructReturn;
02132 }
02133 
02134 /// Make a copy of an aggregate at the address specified by "Src" to the address
02135 /// "Dst", with size and alignment information taken from the corresponding
02136 /// parameter attribute. The copy will be passed as a byval function parameter.
02137 static SDValue
02138 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
02139                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
02140                           SDLoc dl) {
02141   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
02142 
02143   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
02144                        /*isVolatile*/false, /*AlwaysInline=*/true,
02145                        MachinePointerInfo(), MachinePointerInfo());
02146 }
02147 
02148 /// Return true if the calling convention is one that
02149 /// supports tail call optimization.
02150 static bool IsTailCallConvention(CallingConv::ID CC) {
02151   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
02152           CC == CallingConv::HiPE);
02153 }
02154 
02155 /// \brief Return true if the calling convention is a C calling convention.
02156 static bool IsCCallConvention(CallingConv::ID CC) {
02157   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
02158           CC == CallingConv::X86_64_SysV);
02159 }
02160 
02161 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
02162   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
02163     return false;
02164 
02165   CallSite CS(CI);
02166   CallingConv::ID CalleeCC = CS.getCallingConv();
02167   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
02168     return false;
02169 
02170   return true;
02171 }
02172 
02173 /// Return true if the function is being made into
02174 /// a tailcall target by changing its ABI.
02175 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
02176                                    bool GuaranteedTailCallOpt) {
02177   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
02178 }
02179 
02180 SDValue
02181 X86TargetLowering::LowerMemArgument(SDValue Chain,
02182                                     CallingConv::ID CallConv,
02183                                     const SmallVectorImpl<ISD::InputArg> &Ins,
02184                                     SDLoc dl, SelectionDAG &DAG,
02185                                     const CCValAssign &VA,
02186                                     MachineFrameInfo *MFI,
02187                                     unsigned i) const {
02188   // Create the nodes corresponding to a load from this parameter slot.
02189   ISD::ArgFlagsTy Flags = Ins[i].Flags;
02190   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
02191       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
02192   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
02193   EVT ValVT;
02194 
02195   // If the value is passed by pointer, we have the address passed instead of
02196   // the value itself.
02197   if (VA.getLocInfo() == CCValAssign::Indirect)
02198     ValVT = VA.getLocVT();
02199   else
02200     ValVT = VA.getValVT();
02201 
02202   // FIXME: For now, all byval parameter objects are marked mutable. This can be
02203   // changed with more analysis.
02204   // In case of tail call optimization, mark all arguments mutable, since they
02205   // could be overwritten by the lowering of arguments in case of a tail call.
02206   if (Flags.isByVal()) {
02207     unsigned Bytes = Flags.getByValSize();
02208     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
02209     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
02210     return DAG.getFrameIndex(FI, getPointerTy());
02211   } else {
02212     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
02213                                     VA.getLocMemOffset(), isImmutable);
02214     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
02215     return DAG.getLoad(ValVT, dl, Chain, FIN,
02216                        MachinePointerInfo::getFixedStack(FI),
02217                        false, false, false, 0);
02218   }
02219 }
02220 
02221 // FIXME: Get this from tablegen.
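// Returns the integer registers used for 64-bit argument passing: RCX, RDX,
// R8 and R9 under the Win64 convention, or the six SysV GPRs otherwise.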
02222 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
02223                                                 const X86Subtarget *Subtarget) {
02224   assert(Subtarget->is64Bit());
02225 
02226   if (Subtarget->isCallingConvWin64(CallConv)) {
02227     static const MCPhysReg GPR64ArgRegsWin64[] = {
02228       X86::RCX, X86::RDX, X86::R8,  X86::R9
02229     };
02230     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
02231   }
02232 
02233   static const MCPhysReg GPR64ArgRegs64Bit[] = {
02234     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
02235   };
02236   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
02237 }
02238 
02239 // FIXME: Get this from tablegen.
02240 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
02241                                                 CallingConv::ID CallConv,
02242                                                 const X86Subtarget *Subtarget) {
02243   assert(Subtarget->is64Bit());
02244   if (Subtarget->isCallingConvWin64(CallConv)) {
02245     // The XMM registers which might contain var arg parameters are shadowed
02246     // in their paired GPR, so we only need to save the GPRs to their home
02247     // slots.
02248     // TODO: __vectorcall will change this.
02249     return None;
02250   }
02251 
02252   const Function *Fn = MF.getFunction();
02253   bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
02254   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
02255          "SSE register cannot be used when SSE is disabled!");
02256   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
02257       !Subtarget->hasSSE1())
02258     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
02259     // registers.
02260     return None;
02261 
02262   static const MCPhysReg XMMArgRegs64Bit[] = {
02263     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02264     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02265   };
02266   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
02267 }
02268 
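      /// Lower the incoming (formal) arguments of a function: copy register
      /// arguments out of their physical registers, load stack arguments from the
      /// caller's frame, and set up the vararg register save area when needed.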
02269 SDValue
02270 X86TargetLowering::LowerFormalArguments(SDValue Chain,
02271                                         CallingConv::ID CallConv,
02272                                         bool isVarArg,
02273                                       const SmallVectorImpl<ISD::InputArg> &Ins,
02274                                         SDLoc dl,
02275                                         SelectionDAG &DAG,
02276                                         SmallVectorImpl<SDValue> &InVals)
02277                                           const {
02278   MachineFunction &MF = DAG.getMachineFunction();
02279   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02280 
02281   const Function* Fn = MF.getFunction();
02282   if (Fn->hasExternalLinkage() &&
02283       Subtarget->isTargetCygMing() &&
02284       Fn->getName() == "main")
02285     FuncInfo->setForceFramePointer(true);
02286 
02287   MachineFrameInfo *MFI = MF.getFrameInfo();
02288   bool Is64Bit = Subtarget->is64Bit();
02289   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
02290 
02291   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02292          "Var args not supported with calling convention fastcc, ghc or hipe");
02293 
02294   // Assign locations to all of the incoming arguments.
02295   SmallVector<CCValAssign, 16> ArgLocs;
02296   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02297 
02298   // Allocate shadow area for Win64
02299   if (IsWin64)
02300     CCInfo.AllocateStack(32, 8);
02301 
02302   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
02303 
02304   unsigned LastVal = ~0U;
02305   SDValue ArgValue;
02306   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02307     CCValAssign &VA = ArgLocs[i];
02308     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
02309     // places.
02310     assert(VA.getValNo() != LastVal &&
02311            "Don't support value assigned to multiple locs yet");
02312     (void)LastVal;
02313     LastVal = VA.getValNo();
02314 
02315     if (VA.isRegLoc()) {
02316       EVT RegVT = VA.getLocVT();
02317       const TargetRegisterClass *RC;
02318       if (RegVT == MVT::i32)
02319         RC = &X86::GR32RegClass;
02320       else if (Is64Bit && RegVT == MVT::i64)
02321         RC = &X86::GR64RegClass;
02322       else if (RegVT == MVT::f32)
02323         RC = &X86::FR32RegClass;
02324       else if (RegVT == MVT::f64)
02325         RC = &X86::FR64RegClass;
02326       else if (RegVT.is512BitVector())
02327         RC = &X86::VR512RegClass;
02328       else if (RegVT.is256BitVector())
02329         RC = &X86::VR256RegClass;
02330       else if (RegVT.is128BitVector())
02331         RC = &X86::VR128RegClass;
02332       else if (RegVT == MVT::x86mmx)
02333         RC = &X86::VR64RegClass;
02334       else if (RegVT == MVT::i1)
02335         RC = &X86::VK1RegClass;
02336       else if (RegVT == MVT::v8i1)
02337         RC = &X86::VK8RegClass;
02338       else if (RegVT == MVT::v16i1)
02339         RC = &X86::VK16RegClass;
02340       else if (RegVT == MVT::v32i1)
02341         RC = &X86::VK32RegClass;
02342       else if (RegVT == MVT::v64i1)
02343         RC = &X86::VK64RegClass;
02344       else
02345         llvm_unreachable("Unknown argument type!");
02346 
02347       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
02348       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
02349 
02350       // If this is an 8 or 16-bit value, it is really passed promoted to 32
02351       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
02352       // right size.
02353       if (VA.getLocInfo() == CCValAssign::SExt)
02354         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
02355                                DAG.getValueType(VA.getValVT()));
02356       else if (VA.getLocInfo() == CCValAssign::ZExt)
02357         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
02358                                DAG.getValueType(VA.getValVT()));
02359       else if (VA.getLocInfo() == CCValAssign::BCvt)
02360         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
02361 
02362       if (VA.isExtInLoc()) {
02363         // Handle MMX values passed in XMM regs.
02364         if (RegVT.isVector())
02365           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
02366         else
02367           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
02368       }
02369     } else {
02370       assert(VA.isMemLoc());
02371       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
02372     }
02373 
02374     // If the value is passed via a pointer, do a load.
02375     if (VA.getLocInfo() == CCValAssign::Indirect)
02376       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
02377                              MachinePointerInfo(), false, false, false, 0);
02378 
02379     InVals.push_back(ArgValue);
02380   }
02381 
02382   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
02383     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02384       // The x86-64 ABIs require that for returning structs by value we copy
02385       // the sret argument into %rax/%eax (depending on ABI) for the return.
02386       // Win32 requires us to put the sret argument in %eax as well.
02387       // Save the argument into a virtual register so that we can access it
02388       // from the return points.
02389       if (Ins[i].Flags.isSRet()) {
02390         unsigned Reg = FuncInfo->getSRetReturnReg();
02391         if (!Reg) {
02392           MVT PtrTy = getPointerTy();
02393           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
02394           FuncInfo->setSRetReturnReg(Reg);
02395         }
02396         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
02397         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
02398         break;
02399       }
02400     }
02401   }
02402 
02403   unsigned StackSize = CCInfo.getNextStackOffset();
02404   // Align stack specially for tail calls.
02405   if (FuncIsMadeTailCallSafe(CallConv,
02406                              MF.getTarget().Options.GuaranteedTailCallOpt))
02407     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
02408 
02409   // If the function takes a variable number of arguments, make a frame index for
02410   // the start of the first vararg value... for expansion of llvm.va_start. We
02411   // can skip this if there are no va_start calls.
02412   if (MFI->hasVAStart() &&
02413       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
02414                    CallConv != CallingConv::X86_ThisCall))) {
02415     FuncInfo->setVarArgsFrameIndex(
02416         MFI->CreateFixedObject(1, StackSize, true));
02417   }
02418 
02419   // Figure out if XMM registers are in use.
02420   assert(!(MF.getTarget().Options.UseSoftFloat &&
02421            Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
02422          "SSE register cannot be used when SSE is disabled!");
02423 
02424   // 64-bit calling conventions support varargs and register parameters, so we
02425   // have to do extra work to spill them in the prologue.
02426   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
02427     // Find the first unallocated argument register in each register class.
02428     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
02429     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
02430     unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
02431     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
02432     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
02433            "SSE register cannot be used when SSE is disabled!");
02434 
02435     // Gather all the live in physical registers.
02436     SmallVector<SDValue, 6> LiveGPRs;
02437     SmallVector<SDValue, 8> LiveXMMRegs;
02438     SDValue ALVal;
02439     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
02440       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
02441       LiveGPRs.push_back(
02442           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
02443     }
02444     if (!ArgXMMs.empty()) {
02445       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02446       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
02447       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
02448         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
02449         LiveXMMRegs.push_back(
02450             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
02451       }
02452     }
02453 
02454     if (IsWin64) {
02455       const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
02456       // Get to the caller-allocated home save location.  Add 8 to account
02457       // for the return address.
02458       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
02459       FuncInfo->setRegSaveFrameIndex(
02460           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
02461       // Fixup to set vararg frame on shadow area (4 x i64).
02462       if (NumIntRegs < 4)
02463         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
02464     } else {
02465       // For X86-64, if there are vararg parameters that are passed via
02466       // registers, then we must store them to their spots on the stack so
02467       // they may be loaded by dereferencing the result of va_next.
02468       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
02469       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
02470       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
02471           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
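            // With SSE enabled this is the full SysV AMD64 register save area:
            // 6 GPRs * 8 bytes plus 8 XMM registers * 16 bytes = 176 bytes; the
            // gp_offset and fp_offset fields of a va_list index into this block.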
02472     }
02473 
02474     // Store the integer parameter registers.
02475     SmallVector<SDValue, 8> MemOps;
02476     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
02477                                       getPointerTy());
02478     unsigned Offset = FuncInfo->getVarArgsGPOffset();
02479     for (SDValue Val : LiveGPRs) {
02480       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
02481                                 DAG.getIntPtrConstant(Offset));
02482       SDValue Store =
02483         DAG.getStore(Val.getValue(1), dl, Val, FIN,
02484                      MachinePointerInfo::getFixedStack(
02485                        FuncInfo->getRegSaveFrameIndex(), Offset),
02486                      false, false, 0);
02487       MemOps.push_back(Store);
02488       Offset += 8;
02489     }
02490 
02491     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
02492       // Now store the XMM (fp + vector) parameter registers.
02493       SmallVector<SDValue, 12> SaveXMMOps;
02494       SaveXMMOps.push_back(Chain);
02495       SaveXMMOps.push_back(ALVal);
02496       SaveXMMOps.push_back(DAG.getIntPtrConstant(
02497                              FuncInfo->getRegSaveFrameIndex()));
02498       SaveXMMOps.push_back(DAG.getIntPtrConstant(
02499                              FuncInfo->getVarArgsFPOffset()));
02500       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
02501                         LiveXMMRegs.end());
02502       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
02503                                    MVT::Other, SaveXMMOps));
02504     }
02505 
02506     if (!MemOps.empty())
02507       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
02508   }
02509 
02510   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
02511     // Find the largest legal vector type.
02512     MVT VecVT = MVT::Other;
02513     // FIXME: Only some x86_32 calling conventions support AVX512.
02514     if (Subtarget->hasAVX512() &&
02515         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
02516                      CallConv == CallingConv::Intel_OCL_BI)))
02517       VecVT = MVT::v16f32;
02518     else if (Subtarget->hasAVX())
02519       VecVT = MVT::v8f32;
02520     else if (Subtarget->hasSSE2())
02521       VecVT = MVT::v4f32;
02522 
02523     // We forward some GPRs and some vector types.
02524     SmallVector<MVT, 2> RegParmTypes;
02525     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
02526     RegParmTypes.push_back(IntVT);
02527     if (VecVT != MVT::Other)
02528       RegParmTypes.push_back(VecVT);
02529 
02530     // Compute the set of forwarded registers. The rest are scratch.
02531     SmallVectorImpl<ForwardedRegister> &Forwards =
02532         FuncInfo->getForwardedMustTailRegParms();
02533     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
02534 
02535     // Conservatively forward AL on x86_64, since it might be used for varargs.
02536     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
02537       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02538       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
02539     }
02540 
02541     // Copy all forwards from physical to virtual registers.
02542     for (ForwardedRegister &F : Forwards) {
02543       // FIXME: Can we use a less constrained schedule?
02544       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
02545       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
02546       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
02547     }
02548   }
02549 
02550   // Some CCs need callee pop.
02551   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02552                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
02553     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
02554   } else {
02555     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
02556     // If this is an sret function, the return should pop the hidden pointer.
02557     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
02558         !Subtarget->getTargetTriple().isOSMSVCRT() &&
02559         argsAreStructReturn(Ins) == StackStructReturn)
02560       FuncInfo->setBytesToPopOnReturn(4);
02561   }
02562 
02563   if (!Is64Bit) {
02564     // RegSaveFrameIndex is X86-64 only.
02565     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
02566     if (CallConv == CallingConv::X86_FastCall ||
02567         CallConv == CallingConv::X86_ThisCall)
02568       // fastcc functions can't have varargs.
02569       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
02570   }
02571 
02572   FuncInfo->setArgumentStackSize(StackSize);
02573 
02574   return Chain;
02575 }
02576 
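      /// Emit the store (or, for byval arguments, the copy) of a single outgoing
      /// call argument into its stack slot at StackPtr + the location's offset.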
02577 SDValue
02578 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
02579                                     SDValue StackPtr, SDValue Arg,
02580                                     SDLoc dl, SelectionDAG &DAG,
02581                                     const CCValAssign &VA,
02582                                     ISD::ArgFlagsTy Flags) const {
02583   unsigned LocMemOffset = VA.getLocMemOffset();
02584   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
02585   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
02586   if (Flags.isByVal())
02587     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
02588 
02589   return DAG.getStore(Chain, dl, Arg, PtrOff,
02590                       MachinePointerInfo::getStack(LocMemOffset),
02591                       false, false, 0);
02592 }
02593 
02594 /// Emit a load of the return address if tail call
02595 /// optimization is performed and it is required.
02596 SDValue
02597 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
02598                                            SDValue &OutRetAddr, SDValue Chain,
02599                                            bool IsTailCall, bool Is64Bit,
02600                                            int FPDiff, SDLoc dl) const {
02601   // Adjust the Return address stack slot.
02602   EVT VT = getPointerTy();
02603   OutRetAddr = getReturnAddressFrameIndex(DAG);
02604 
02605   // Load the "old" Return address.
02606   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
02607                            false, false, false, 0);
02608   return SDValue(OutRetAddr.getNode(), 1);
02609 }
02610 
02611 /// Emit a store of the return address if tail call
02612 /// optimization is performed and it is required (FPDiff!=0).
02613 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
02614                                         SDValue Chain, SDValue RetAddrFrIdx,
02615                                         EVT PtrVT, unsigned SlotSize,
02616                                         int FPDiff, SDLoc dl) {
02617   // Store the return address to the appropriate stack slot.
02618   if (!FPDiff) return Chain;
02619   // Calculate the new stack slot for the return address.
02620   int NewReturnAddrFI =
02621     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
02622                                          false);
02623   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
02624   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
02625                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
02626                        false, false, 0);
02627   return Chain;
02628 }
02629 
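      /// Lower an outgoing call: classify the operands, copy register arguments
      /// into place, store stack arguments, and emit the X86ISD::CALL node (or
      /// X86ISD::TC_RETURN for tail calls).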
02630 SDValue
02631 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
02632                              SmallVectorImpl<SDValue> &InVals) const {
02633   SelectionDAG &DAG                     = CLI.DAG;
02634   SDLoc &dl                             = CLI.DL;
02635   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
02636   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
02637   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
02638   SDValue Chain                         = CLI.Chain;
02639   SDValue Callee                        = CLI.Callee;
02640   CallingConv::ID CallConv              = CLI.CallConv;
02641   bool &isTailCall                      = CLI.IsTailCall;
02642   bool isVarArg                         = CLI.IsVarArg;
02643 
02644   MachineFunction &MF = DAG.getMachineFunction();
02645   bool Is64Bit        = Subtarget->is64Bit();
02646   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
02647   StructReturnType SR = callIsStructReturn(Outs);
02648   bool IsSibcall      = false;
02649   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
02650 
02651   if (MF.getTarget().Options.DisableTailCalls)
02652     isTailCall = false;
02653 
02654   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
02655   if (IsMustTail) {
02656     // Force this to be a tail call.  The verifier rules are enough to ensure
02657     // that we can lower this successfully without moving the return address
02658     // around.
02659     isTailCall = true;
02660   } else if (isTailCall) {
02661     // Check if it's really possible to do a tail call.
02662     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
02663                     isVarArg, SR != NotStructReturn,
02664                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
02665                     Outs, OutVals, Ins, DAG);
02666 
02667     // Sibcalls are automatically detected tailcalls which do not require
02668     // ABI changes.
02669     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
02670       IsSibcall = true;
02671 
02672     if (isTailCall)
02673       ++NumTailCalls;
02674   }
02675 
02676   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02677          "Var args not supported with calling convention fastcc, ghc or hipe");
02678 
02679   // Analyze operands of the call, assigning locations to each operand.
02680   SmallVector<CCValAssign, 16> ArgLocs;
02681   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02682 
02683   // Allocate shadow area for Win64
02684   if (IsWin64)
02685     CCInfo.AllocateStack(32, 8);
02686 
02687   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
02688 
02689   // Get a count of how many bytes are to be pushed on the stack.
02690   unsigned NumBytes = CCInfo.getNextStackOffset();
02691   if (IsSibcall)
02692     // This is a sibcall. The memory operands are already available in the
02693     // caller's own incoming argument area (the caller's caller's stack frame).
02694     NumBytes = 0;
02695   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
02696            IsTailCallConvention(CallConv))
02697     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
02698 
02699   int FPDiff = 0;
02700   if (isTailCall && !IsSibcall && !IsMustTail) {
02701     // Lower arguments at fp - stackoffset + fpdiff.
02702     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
02703 
02704     FPDiff = NumBytesCallerPushed - NumBytes;
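          // A negative FPDiff means the callee needs more argument space than the
          // caller's incoming argument area provides, so the return address has to
          // be moved to make room (see EmitTailCallStoreRetAddr).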
02705 
02706     // Record the delta by which the return address stack slot moves, but only
02707     // if this call requires a larger adjustment than any recorded so far.
02708     if (FPDiff < X86Info->getTCReturnAddrDelta())
02709       X86Info->setTCReturnAddrDelta(FPDiff);
02710   }
02711 
02712   unsigned NumBytesToPush = NumBytes;
02713   unsigned NumBytesToPop = NumBytes;
02714 
02715   // If we have an inalloca argument, all stack space has already been allocated
02716   // for us and is right at the top of the stack.  We don't support multiple
02717   // arguments passed in memory when using inalloca.
02718   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
02719     NumBytesToPush = 0;
02720     if (!ArgLocs.back().isMemLoc())
02721       report_fatal_error("cannot use inalloca attribute on a register "
02722                          "parameter");
02723     if (ArgLocs.back().getLocMemOffset() != 0)
02724       report_fatal_error("any parameter with the inalloca attribute must be "
02725                          "the only memory argument");
02726   }
02727 
02728   if (!IsSibcall)
02729     Chain = DAG.getCALLSEQ_START(
02730         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
02731 
02732   SDValue RetAddrFrIdx;
02733   // Load return address for tail calls.
02734   if (isTailCall && FPDiff)
02735     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
02736                                     Is64Bit, FPDiff, dl);
02737 
02738   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
02739   SmallVector<SDValue, 8> MemOpChains;
02740   SDValue StackPtr;
02741 
02742   // Walk the register/memloc assignments, inserting copies/loads.  In the case
02743   // of tail call optimization, arguments are handled later.
02744   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
02745   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02746     // Skip inalloca arguments; they have already been written.
02747     ISD::ArgFlagsTy Flags = Outs[i].Flags;
02748     if (Flags.isInAlloca())
02749       continue;
02750 
02751     CCValAssign &VA = ArgLocs[i];
02752     EVT RegVT = VA.getLocVT();
02753     SDValue Arg = OutVals[i];
02754     bool isByVal = Flags.isByVal();
02755 
02756     // Promote the value if needed.
02757     switch (VA.getLocInfo()) {
02758     default: llvm_unreachable("Unknown loc info!");
02759     case CCValAssign::Full: break;
02760     case CCValAssign::SExt:
02761       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
02762       break;
02763     case CCValAssign::ZExt:
02764       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
02765       break;
02766     case CCValAssign::AExt:
02767       if (RegVT.is128BitVector()) {
02768         // Special case: passing MMX values in XMM registers.
02769         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
02770         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
02771         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
02772       } else
02773         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
02774       break;
02775     case CCValAssign::BCvt:
02776       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
02777       break;
02778     case CCValAssign::Indirect: {
02779       // Store the argument.
02780       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
02781       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
02782       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
02783                            MachinePointerInfo::getFixedStack(FI),
02784                            false, false, 0);
02785       Arg = SpillSlot;
02786       break;
02787     }
02788     }
02789 
02790     if (VA.isRegLoc()) {
02791       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
02792       if (isVarArg && IsWin64) {
02793         // The Win64 ABI requires an argument XMM register to be copied to the
02794         // corresponding shadow GPR if the callee is a varargs function.
02795         unsigned ShadowReg = 0;
02796         switch (VA.getLocReg()) {
02797         case X86::XMM0: ShadowReg = X86::RCX; break;
02798         case X86::XMM1: ShadowReg = X86::RDX; break;
02799         case X86::XMM2: ShadowReg = X86::R8; break;
02800         case X86::XMM3: ShadowReg = X86::R9; break;
02801         }
02802         if (ShadowReg)
02803           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
02804       }
02805     } else if (!IsSibcall && (!isTailCall || isByVal)) {
02806       assert(VA.isMemLoc());
02807       if (!StackPtr.getNode())
02808         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
02809                                       getPointerTy());
02810       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
02811                                              dl, DAG, VA, Flags));
02812     }
02813   }
02814 
02815   if (!MemOpChains.empty())
02816     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
02817 
02818   if (Subtarget->isPICStyleGOT()) {
02819     // ELF / PIC requires the GOT pointer to be in the EBX register before
02820     // function calls made via the PLT.
02821     if (!isTailCall) {
02822       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
02823                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
02824     } else {
02825       // If we are tail calling and generating PIC/GOT style code load the
02826       // address of the callee into ECX. The value in ECX is used as the target of
02827       // the tail jump. This is done to circumvent the ebx/callee-saved problem
02828       // for tail calls on PIC/GOT architectures. Normally we would just put the
02829       // address of GOT into ebx and then call target@PLT. But for tail calls
02830       // ebx would be restored (since ebx is callee saved) before jumping to the
02831       // target@PLT.
02832 
02833       // Note: The actual moving to ECX is done further down.
02834       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
02835       if (G && !G->getGlobal()->hasHiddenVisibility() &&
02836           !G->getGlobal()->hasProtectedVisibility())
02837         Callee = LowerGlobalAddress(Callee, DAG);
02838       else if (isa<ExternalSymbolSDNode>(Callee))
02839         Callee = LowerExternalSymbol(Callee, DAG);
02840     }
02841   }
02842 
02843   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
02844     // From AMD64 ABI document:
02845     // For calls that may call functions that use varargs or stdargs
02846     // (prototype-less calls or calls to functions containing ellipsis (...) in
02847     // the declaration) %al is used as a hidden argument to specify the number
02848     // of SSE registers used. The contents of %al do not need to match exactly
02849     // the number of registers, but must be an upper bound on the number of SSE
02850     // registers used and is in the range 0 - 8 inclusive.
02851 
02852     // Count the number of XMM registers allocated.
02853     static const MCPhysReg XMMArgRegs[] = {
02854       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02855       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02856     };
02857     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
02858     assert((Subtarget->hasSSE1() || !NumXMMRegs)
02859            && "SSE registers cannot be used when SSE is disabled");
02860 
02861     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
02862                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
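          // For example, a varargs call that passes one double in XMM0 ends up
          // with AL = 1, while a call that uses no vector registers passes AL = 0.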
02863   }
02864 
02865   if (isVarArg && IsMustTail) {
02866     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
02867     for (const auto &F : Forwards) {
02868       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
02869       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
02870     }
02871   }
02872 
02873   // For tail calls, lower the arguments to the 'real' stack slots.  Sibcalls
02874   // don't need this because the eligibility check rejects calls that require
02875   // shuffling arguments passed in memory.
02876   if (!IsSibcall && isTailCall) {
02877     // Force all the incoming stack arguments to be loaded from the stack
02878     // before any new outgoing arguments are stored to the stack, because the
02879     // outgoing stack slots may alias the incoming argument stack slots, and
02880     // the alias isn't otherwise explicit. This is slightly more conservative
02881     // than necessary, because it means that each store effectively depends
02882     // on every argument instead of just those arguments it would clobber.
02883     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
02884 
02885     SmallVector<SDValue, 8> MemOpChains2;
02886     SDValue FIN;
02887     int FI = 0;
02888     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02889       CCValAssign &VA = ArgLocs[i];
02890       if (VA.isRegLoc())
02891         continue;
02892       assert(VA.isMemLoc());
02893       SDValue Arg = OutVals[i];
02894       ISD::ArgFlagsTy Flags = Outs[i].Flags;
02895       // Skip inalloca arguments.  They don't require any work.
02896       if (Flags.isInAlloca())
02897         continue;
02898       // Create frame index.
02899       int32_t Offset = VA.getLocMemOffset()+FPDiff;
02900       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
02901       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
02902       FIN = DAG.getFrameIndex(FI, getPointerTy());
02903 
02904       if (Flags.isByVal()) {
02905         // Copy relative to framepointer.
02906         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
02907         if (!StackPtr.getNode())
02908           StackPtr = DAG.getCopyFromReg(Chain, dl,
02909                                         RegInfo->getStackRegister(),
02910                                         getPointerTy());
02911         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
02912 
02913         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
02914                                                          ArgChain,
02915                                                          Flags, DAG, dl));
02916       } else {
02917         // Store relative to framepointer.
02918         MemOpChains2.push_back(
02919           DAG.getStore(ArgChain, dl, Arg, FIN,
02920                        MachinePointerInfo::getFixedStack(FI),
02921                        false, false, 0));
02922       }
02923     }
02924 
02925     if (!MemOpChains2.empty())
02926       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
02927 
02928     // Store the return address to the appropriate stack slot.
02929     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
02930                                      getPointerTy(), RegInfo->getSlotSize(),
02931                                      FPDiff, dl);
02932   }
02933 
02934   // Build a sequence of copy-to-reg nodes chained together with token chain
02935   // and flag operands which copy the outgoing args into registers.
02936   SDValue InFlag;
02937   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
02938     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
02939                              RegsToPass[i].second, InFlag);
02940     InFlag = Chain.getValue(1);
02941   }
02942 
02943   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
02944     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
02945     // In the 64-bit large code model, we have to make all calls
02946     // through a register, since the call instruction's 32-bit
02947     // pc-relative offset may not be large enough to hold the whole
02948     // address.
02949   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
02950     // If the callee is a GlobalAddress node (quite common, every direct call
02951     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
02952     // it.
02953     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
02954 
02955     // We should use an extra load for direct calls to dllimported functions in
02956     // non-JIT mode.
02957     const GlobalValue *GV = G->getGlobal();
02958     if (!GV->hasDLLImportStorageClass()) {
02959       unsigned char OpFlags = 0;
02960       bool ExtraLoad = false;
02961       unsigned WrapperKind = ISD::DELETED_NODE;
02962 
02963       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
02964       // external symbols must go through the PLT in PIC mode.  If the symbol
02965       // has hidden or protected visibility, or if it is static or local, then
02966       // we don't need to use the PLT - we can directly call it.
02967       if (Subtarget->isTargetELF() &&
02968           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
02969           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
02970         OpFlags = X86II::MO_PLT;
02971       } else if (Subtarget->isPICStyleStubAny() &&
02972                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
02973                  (!Subtarget->getTargetTriple().isMacOSX() ||
02974                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
02975         // PC-relative references to external symbols should go through $stub,
02976         // unless we're building with the leopard linker or later, which
02977         // automatically synthesizes these stubs.
02978         OpFlags = X86II::MO_DARWIN_STUB;
02979       } else if (Subtarget->isPICStyleRIPRel() && isa<Function>(GV) &&
02980                  cast<Function>(GV)->hasFnAttribute(Attribute::NonLazyBind)) {
02981         // If the function is marked as non-lazy, generate an indirect call
02982         // which loads from the GOT directly. This avoids runtime overhead
02983         // at the cost of eager binding (and one extra byte of encoding).
02984         OpFlags = X86II::MO_GOTPCREL;
02985         WrapperKind = X86ISD::WrapperRIP;
02986         ExtraLoad = true;
02987       }
02988 
02989       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
02990                                           G->getOffset(), OpFlags);
02991 
02992       // Add a wrapper if needed.
02993       if (WrapperKind != ISD::DELETED_NODE)
02994         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
02995       // Add extra indirection if needed.
02996       if (ExtraLoad)
02997         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
02998                              MachinePointerInfo::getGOT(),
02999                              false, false, false, 0);
03000     }
03001   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
03002     unsigned char OpFlags = 0;
03003 
03004     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
03005     // external symbols should go through the PLT.
03006     if (Subtarget->isTargetELF() &&
03007         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
03008       OpFlags = X86II::MO_PLT;
03009     } else if (Subtarget->isPICStyleStubAny() &&
03010                (!Subtarget->getTargetTriple().isMacOSX() ||
03011                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03012       // PC-relative references to external symbols should go through $stub,
03013       // unless we're building with the leopard linker or later, which
03014       // automatically synthesizes these stubs.
03015       OpFlags = X86II::MO_DARWIN_STUB;
03016     }
03017 
03018     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
03019                                          OpFlags);
03020   } else if (Subtarget->isTarget64BitILP32() &&
03021              Callee->getValueType(0) == MVT::i32) {
03022     // Zero-extend the 32-bit Callee address into a 64-bit one, per the x32 ABI.
03023     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
03024   }
03025 
03026   // Returns a chain & a flag for retval copy to use.
03027   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
03028   SmallVector<SDValue, 8> Ops;
03029 
03030   if (!IsSibcall && isTailCall) {
03031     Chain = DAG.getCALLSEQ_END(Chain,
03032                                DAG.getIntPtrConstant(NumBytesToPop, true),
03033                                DAG.getIntPtrConstant(0, true), InFlag, dl);
03034     InFlag = Chain.getValue(1);
03035   }
03036 
03037   Ops.push_back(Chain);
03038   Ops.push_back(Callee);
03039 
03040   if (isTailCall)
03041     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
03042 
03043   // Add argument registers to the end of the list so that they are known live
03044   // into the call.
03045   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
03046     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
03047                                   RegsToPass[i].second.getValueType()));
03048 
03049   // Add a register mask operand representing the call-preserved registers.
03050   const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
03051   const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
03052   assert(Mask && "Missing call preserved mask for calling convention");
03053   Ops.push_back(DAG.getRegisterMask(Mask));
03054 
03055   if (InFlag.getNode())
03056     Ops.push_back(InFlag);
03057 
03058   if (isTailCall) {
03059     // We used to do:
03060     //// If this is the first return lowered for this function, add the regs
03061     //// to the liveout set for the function.
03062     // This isn't right, although it's probably harmless on x86; liveouts
03063     // should be computed from returns not tail calls.  Consider a void
03064     // function making a tail call to a function returning int.
03065     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
03066   }
03067 
03068   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
03069   InFlag = Chain.getValue(1);
03070 
03071   // Create the CALLSEQ_END node.
03072   unsigned NumBytesForCalleeToPop;
03073   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
03074                        DAG.getTarget().Options.GuaranteedTailCallOpt))
03075     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
03076   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
03077            !Subtarget->getTargetTriple().isOSMSVCRT() &&
03078            SR == StackStructReturn)
03079     // If this is a call to a struct-return function, the callee
03080     // pops the hidden struct pointer, so we have to push it back.
03081     // This is common for Darwin/X86, Linux & Mingw32 targets.
03082     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
03083     NumBytesForCalleeToPop = 4;
03084   else
03085     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
03086 
03087   // Returns a flag for retval copy to use.
03088   if (!IsSibcall) {
03089     Chain = DAG.getCALLSEQ_END(Chain,
03090                                DAG.getIntPtrConstant(NumBytesToPop, true),
03091                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
03092                                                      true),
03093                                InFlag, dl);
03094     InFlag = Chain.getValue(1);
03095   }
03096 
03097   // Handle result values, copying them out of physregs into vregs that we
03098   // return.
03099   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
03100                          Ins, dl, DAG, InVals);
03101 }
03102 
03103 //===----------------------------------------------------------------------===//
03104 //                Fast Calling Convention (tail call) implementation
03105 //===----------------------------------------------------------------------===//
03106 
03107 //  Like the stdcall convention, the callee cleans up the arguments, except
03108 //  that ECX is reserved for storing the address of the tail-called function.
03109 //  Only 2 registers are free for argument passing (inreg). Tail call
03110 //  optimization is performed provided:
03111 //                * tailcallopt is enabled
03112 //                * caller/callee are fastcc
03113 //  On the X86_64 architecture with GOT-style position-independent code, only
03114 //  local (within-module) calls are supported at the moment.
03115 //  To keep the stack aligned according to the platform ABI, the function
03116 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
03117 //  multiple of the stack alignment. (Dynamic linkers need this - darwin's
03118 //  dyld, for example.) If a tail-called callee has more arguments than the
03119 //  caller, the caller must make sure that there is room to move the RETADDR
03120 //  to. This is achieved by reserving an area the size of the argument delta
03121 //  right after the original RETADDR, but before the saved frame pointer or the
03122 //  spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
03123 //  stack layout:
03124 //    arg1
03125 //    arg2
03126 //    RETADDR
03127 //    [ new RETADDR
03128 //      move area ]
03129 //    (possible EBP)
03130 //    ESI
03131 //    EDI
03132 //    local1 ..
03133 
03134 /// GetAlignedArgumentStackSize - Align the stack size, e.g. to 16n + 12 bytes
03135 /// for a 16-byte alignment requirement with a 4-byte slot size.
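      /// For example, with StackAlignment = 16 and SlotSize = 4, a StackSize of 20
      /// is rounded up to 28 and one of 30 to 44, so that pushing the 4-byte return
      /// address leaves the stack 16-byte aligned.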
03136 unsigned
03137 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
03138                                                SelectionDAG& DAG) const {
03139   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
03140   const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
03141   unsigned StackAlignment = TFI.getStackAlignment();
03142   uint64_t AlignMask = StackAlignment - 1;
03143   int64_t Offset = StackSize;
03144   unsigned SlotSize = RegInfo->getSlotSize();
03145   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
03146     // The remainder does not exceed StackAlignment - SlotSize; just add the difference.
03147     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
03148   } else {
03149     // Mask out the lower bits and add StackAlignment plus (StackAlignment - SlotSize).
03150     Offset = ((~AlignMask) & Offset) + StackAlignment +
03151       (StackAlignment-SlotSize);
03152   }
03153   return Offset;
03154 }
03155 
03156 /// MatchingStackOffset - Return true if the given stack call argument is
03157 /// already available in the same position (relatively) of the caller's
03158 /// incoming argument stack.
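      /// This lets a sibcall reuse an incoming stack argument in place: for example,
      /// if the caller received an i32 at incoming offset 8 and passes the same value
      /// to the callee at outgoing offset 8, no copy is needed.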
03159 static
03160 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
03161                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
03162                          const X86InstrInfo *TII) {
03163   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
03164   int FI = INT_MAX;
03165   if (Arg.getOpcode() == ISD::CopyFromReg) {
03166     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
03167     if (!TargetRegisterInfo::isVirtualRegister(VR))
03168       return false;
03169     MachineInstr *Def = MRI->getVRegDef(VR);
03170     if (!Def)
03171       return false;
03172     if (!Flags.isByVal()) {
03173       if (!TII->isLoadFromStackSlot(Def, FI))
03174         return false;
03175     } else {
03176       unsigned Opcode = Def->getOpcode();
03177       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
03178            Opcode == X86::LEA64_32r) &&
03179           Def->getOperand(1).isFI()) {
03180         FI = Def->getOperand(1).getIndex();
03181         Bytes = Flags.getByValSize();
03182       } else
03183         return false;
03184     }
03185   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
03186     if (Flags.isByVal())
03187       // ByVal argument is passed in as a pointer but it's now being
03188       // dereferenced. e.g.
03189       // define @foo(%struct.X* %A) {
03190       //   tail call @bar(%struct.X* byval %A)
03191       // }
03192       return false;
03193     SDValue Ptr = Ld->getBasePtr();
03194     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
03195     if (!FINode)
03196       return false;
03197     FI = FINode->getIndex();
03198   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
03199     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
03200     FI = FINode->getIndex();
03201     Bytes = Flags.getByValSize();
03202   } else
03203     return false;
03204 
03205   assert(FI != INT_MAX);
03206   if (!MFI->isFixedObjectIndex(FI))
03207     return false;
03208   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
03209 }
03210 
03211 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
03212 /// for tail call optimization. Targets which want to do tail call
03213 /// optimization should implement this function.
03214 bool
03215 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
03216                                                      CallingConv::ID CalleeCC,
03217                                                      bool isVarArg,
03218                                                      bool isCalleeStructRet,
03219                                                      bool isCallerStructRet,
03220                                                      Type *RetTy,
03221                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
03222                                     const SmallVectorImpl<SDValue> &OutVals,
03223                                     const SmallVectorImpl<ISD::InputArg> &Ins,
03224                                                      SelectionDAG &DAG) const {
03225   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
03226     return false;
03227 
03228   // If -tailcallopt is specified, make fastcc functions tail-callable.
03229   const MachineFunction &MF = DAG.getMachineFunction();
03230   const Function *CallerF = MF.getFunction();
03231 
03232   // If the function return type is x86_fp80 and the callee return type is not,
03233   // then the FP_EXTEND of the call result is not a nop. It's not safe to
03234   // perform a tailcall optimization here.
03235   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
03236     return false;
03237 
03238   CallingConv::ID CallerCC = CallerF->getCallingConv();
03239   bool CCMatch = CallerCC == CalleeCC;
03240   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
03241   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
03242 
03243   // Win64 functions have extra shadow space for argument homing. Don't do the
03244   // sibcall if the caller and callee have mismatched expectations for this
03245   // space.
03246   if (IsCalleeWin64 != IsCallerWin64)
03247     return false;
03248 
03249   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
03250     if (IsTailCallConvention(CalleeCC) && CCMatch)
03251       return true;
03252     return false;
03253   }
03254 
03255   // Look for obvious safe cases to perform tail call optimization that do not
03256   // require ABI changes. This is what gcc calls sibcall.
03257 
03258   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
03259   // emit a special epilogue.
03260   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
03261   if (RegInfo->needsStackRealignment(MF))
03262     return false;
03263 
03264   // Also avoid sibcall optimization if either caller or callee uses struct
03265   // return semantics.
03266   if (isCalleeStructRet || isCallerStructRet)
03267     return false;
03268 
03269   // A stdcall/thiscall caller is expected to clean up its arguments; the
03270   // callee isn't going to do that.
03271   // FIXME: this is more restrictive than needed. We could produce a tailcall
03272   // when the stack adjustment matches. For example, with a thiscall that takes
03273   // only one argument.
03274   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
03275                    CallerCC == CallingConv::X86_ThisCall))
03276     return false;
03277 
03278   // Do not sibcall optimize vararg calls unless all arguments are passed via
03279   // registers.
03280   if (isVarArg && !Outs.empty()) {
03281 
03282     // Optimizing for varargs on Win64 is unlikely to be safe without
03283     // additional testing.
03284     if (IsCalleeWin64 || IsCallerWin64)
03285       return false;
03286 
03287     SmallVector<CCValAssign, 16> ArgLocs;
03288     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03289                    *DAG.getContext());
03290 
03291     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03292     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
03293       if (!ArgLocs[i].isRegLoc())
03294         return false;
03295   }
03296 
03297   // If the call result is in ST0 / ST1, it needs to be popped off the x87
03298   // stack.  Therefore, if the result is not used, it is not safe to optimize
03299   // this into a sibcall.
03300   bool Unused = false;
03301   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
03302     if (!Ins[i].Used) {
03303       Unused = true;
03304       break;
03305     }
03306   }
03307   if (Unused) {
03308     SmallVector<CCValAssign, 16> RVLocs;
03309     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
03310                    *DAG.getContext());
03311     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
03312     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
03313       CCValAssign &VA = RVLocs[i];
03314       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
03315         return false;
03316     }
03317   }
03318 
03319   // If the calling conventions do not match, then we'd better make sure the
03320   // results are returned in the same way as what the caller expects.
03321   if (!CCMatch) {
03322     SmallVector<CCValAssign, 16> RVLocs1;
03323     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
03324                     *DAG.getContext());
03325     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
03326 
03327     SmallVector<CCValAssign, 16> RVLocs2;
03328     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
03329                     *DAG.getContext());
03330     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
03331 
03332     if (RVLocs1.size() != RVLocs2.size())
03333       return false;
03334     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
03335       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
03336         return false;
03337       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
03338         return false;
03339       if (RVLocs1[i].isRegLoc()) {
03340         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
03341           return false;
03342       } else {
03343         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
03344           return false;
03345       }
03346     }
03347   }
03348 
03349   // If the callee takes no arguments then go on to check the results of the
03350   // call.
03351   if (!Outs.empty()) {
03352     // Check if stack adjustment is needed. For now, do not do this if any
03353     // argument is passed on the stack.
03354     SmallVector<CCValAssign, 16> ArgLocs;
03355     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03356                    *DAG.getContext());
03357 
03358     // Allocate shadow area for Win64
03359     if (IsCalleeWin64)
03360       CCInfo.AllocateStack(32, 8);
03361 
03362     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03363     if (CCInfo.getNextStackOffset()) {
03364       MachineFunction &MF = DAG.getMachineFunction();
03365       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
03366         return false;
03367 
03368       // Check if the arguments are already laid out in the right way as
03369       // the caller's fixed stack objects.
03370       MachineFrameInfo *MFI = MF.getFrameInfo();
03371       const MachineRegisterInfo *MRI = &MF.getRegInfo();
03372       const X86InstrInfo *TII = Subtarget->getInstrInfo();
03373       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03374         CCValAssign &VA = ArgLocs[i];
03375         SDValue Arg = OutVals[i];
03376         ISD::ArgFlagsTy Flags = Outs[i].Flags;
03377         if (VA.getLocInfo() == CCValAssign::Indirect)
03378           return false;
03379         if (!VA.isRegLoc()) {
03380           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
03381                                    MFI, MRI, TII))
03382             return false;
03383         }
03384       }
03385     }
03386 
03387     // If the tailcall address may be in a register, then make sure it's
03388     // possible to register allocate for it. In 32-bit, the call address can
03389     // only target EAX, EDX, or ECX since the tail call must be scheduled after
03390     // callee-saved registers are restored. These happen to be the same
03391     // registers used to pass 'inreg' arguments so watch out for those.
03392     if (!Subtarget->is64Bit() &&
03393         ((!isa<GlobalAddressSDNode>(Callee) &&
03394           !isa<ExternalSymbolSDNode>(Callee)) ||
03395          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
03396       unsigned NumInRegs = 0;
03397       // In PIC we need an extra register to formulate the address computation
03398       // for the callee.
03399       unsigned MaxInRegs =
03400         (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
03401 
03402       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03403         CCValAssign &VA = ArgLocs[i];
03404         if (!VA.isRegLoc())
03405           continue;
03406         unsigned Reg = VA.getLocReg();
03407         switch (Reg) {
03408         default: break;
03409         case X86::EAX: case X86::EDX: case X86::ECX:
03410           if (++NumInRegs == MaxInRegs)
03411             return false;
03412           break;
03413         }
03414       }
03415     }
03416   }
03417 
03418   return true;
03419 }
03420 
03421 FastISel *
03422 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
03423                                   const TargetLibraryInfo *libInfo) const {
03424   return X86::createFastISel(funcInfo, libInfo);
03425 }
03426 
03427 //===----------------------------------------------------------------------===//
03428 //                           Other Lowering Hooks
03429 //===----------------------------------------------------------------------===//
03430 
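      /// Return true if Op is a normal load with a single use, i.e. a load that
      /// could potentially be folded into its user's memory operand.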
03431 static bool MayFoldLoad(SDValue Op) {
03432   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
03433 }
03434 
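      /// Return true if Op has a single use and that use is a normal store, so the
      /// value computed by Op could potentially be folded into the store.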
03435 static bool MayFoldIntoStore(SDValue Op) {
03436   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
03437 }
03438 
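      /// Return true if Opcode is one of the X86-specific shuffle node opcodes
      /// produced when lowering vector shuffles.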
03439 static bool isTargetShuffle(unsigned Opcode) {
03440   switch(Opcode) {
03441   default: return false;
03442   case X86ISD::BLENDI:
03443   case X86ISD::PSHUFB:
03444   case X86ISD::PSHUFD:
03445   case X86ISD::PSHUFHW:
03446   case X86ISD::PSHUFLW:
03447   case X86ISD::SHUFP:
03448   case X86ISD::PALIGNR:
03449   case X86ISD::MOVLHPS:
03450   case X86ISD::MOVLHPD:
03451   case X86ISD::MOVHLPS:
03452   case X86ISD::MOVLPS:
03453   case X86ISD::MOVLPD:
03454   case X86ISD::MOVSHDUP:
03455   case X86ISD::MOVSLDUP:
03456   case X86ISD::MOVDDUP:
03457   case X86ISD::MOVSS:
03458   case X86ISD::MOVSD:
03459   case X86ISD::UNPCKL:
03460   case X86ISD::UNPCKH:
03461   case X86ISD::VPERMILPI:
03462   case X86ISD::VPERM2X128:
03463   case X86ISD::VPERMI:
03464     return true;
03465   }
03466 }
03467 
03468 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03469                                     SDValue V1, unsigned TargetMask,
03470                                     SelectionDAG &DAG) {
03471   switch(Opc) {
03472   default: llvm_unreachable("Unknown x86 shuffle node");
03473   case X86ISD::PSHUFD:
03474   case X86ISD::PSHUFHW:
03475   case X86ISD::PSHUFLW:
03476   case X86ISD::VPERMILPI:
03477   case X86ISD::VPERMI:
03478     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
03479   }
03480 }
03481 
03482 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03483                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
03484   switch(Opc) {
03485   default: llvm_unreachable("Unknown x86 shuffle node");
03486   case X86ISD::MOVLHPS:
03487   case X86ISD::MOVLHPD:
03488   case X86ISD::MOVHLPS:
03489   case X86ISD::MOVLPS:
03490   case X86ISD::MOVLPD:
03491   case X86ISD::MOVSS:
03492   case X86ISD::MOVSD:
03493   case X86ISD::UNPCKL:
03494   case X86ISD::UNPCKH:
03495     return DAG.getNode(Opc, dl, VT, V1, V2);
03496   }
03497 }
03498 
03499 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
03500   MachineFunction &MF = DAG.getMachineFunction();
03501   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
03502   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
03503   int ReturnAddrIndex = FuncInfo->getRAIndex();
03504 
03505   if (ReturnAddrIndex == 0) {
03506     // Set up a frame object for the return address.
03507     unsigned SlotSize = RegInfo->getSlotSize();
03508     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
03509                                                            -(int64_t)SlotSize,
03510                                                            false);
03511     FuncInfo->setRAIndex(ReturnAddrIndex);
03512   }
03513 
03514   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
03515 }
03516 
03517 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
03518                                        bool hasSymbolicDisplacement) {
03519   // The offset should fit into a 32-bit immediate field.
03520   if (!isInt<32>(Offset))
03521     return false;
03522 
03523   // If we don't have a symbolic displacement, there are no extra
03524   // restrictions.
03525   if (!hasSymbolicDisplacement)
03526     return true;
03527 
03528   // FIXME: Some tweaks might be needed for medium code model.
03529   if (M != CodeModel::Small && M != CodeModel::Kernel)
03530     return false;
03531 
03532   // For the small code model we assume that the last object ends at least 16MB
03533   // before the 31-bit boundary. We can also accept fairly large negative
03534   // constants, since all objects live in the positive half of the address space.
03535   if (M == CodeModel::Small && Offset < 16*1024*1024)
03536     return true;
03537 
03538   // For the kernel code model we know that all objects reside in the negative
03539   // half of the 32-bit address space. We must not accept negative offsets, since
03540   // they may fall just outside an object; fairly large positive ones are fine.
03541   if (M == CodeModel::Kernel && Offset >= 0)
03542     return true;
03543 
03544   return false;
03545 }
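// Illustrative examples of the checks above: with a symbolic displacement,
// an Offset of 10*1024*1024 is accepted under the small code model (it stays
// below the 16MB cushion), while any negative Offset is rejected under the
// kernel code model.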
03546 
03547 /// isCalleePop - Determines whether the callee is required to pop its
03548 /// own arguments. Callee pop is necessary to support tail calls.
03549 bool X86::isCalleePop(CallingConv::ID CallingConv,
03550                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
03551   switch (CallingConv) {
03552   default:
03553     return false;
03554   case CallingConv::X86_StdCall:
03555   case CallingConv::X86_FastCall:
03556   case CallingConv::X86_ThisCall:
03557     return !is64Bit;
03558   case CallingConv::Fast:
03559   case CallingConv::GHC:
03560   case CallingConv::HiPE:
03561     if (IsVarArg)
03562       return false;
03563     return TailCallOpt;
03564   }
03565 }
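// For example, a 32-bit X86_StdCall callee pops its own stack arguments
// (ret imm16), so isCalleePop returns true for it, whereas the same
// convention on a 64-bit target returns false.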
03566 
03567 /// \brief Return true if the condition is an unsigned comparison operation.
03568 static bool isX86CCUnsigned(unsigned X86CC) {
03569   switch (X86CC) {
03570   default: llvm_unreachable("Invalid integer condition!");
03571   case X86::COND_E:     return true;
03572   case X86::COND_G:     return false;
03573   case X86::COND_GE:    return false;
03574   case X86::COND_L:     return false;
03575   case X86::COND_LE:    return false;
03576   case X86::COND_NE:    return true;
03577   case X86::COND_B:     return true;
03578   case X86::COND_A:     return true;
03579   case X86::COND_BE:    return true;
03580   case X86::COND_AE:    return true;
03581   }
03582   llvm_unreachable("covered switch fell through?!");
03583 }
03584 
03585 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
03586 /// specific condition code, returning the condition code and the LHS/RHS of the
03587 /// comparison to make.
03588 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
03589                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
03590   if (!isFP) {
03591     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
03592       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
03593         // X > -1  -> compare X against 0, jump if !sign.
03594         RHS = DAG.getConstant(0, RHS.getValueType());
03595         return X86::COND_NS;
03596       }
03597       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
03598         // X < 0  -> compare X against 0, jump on sign.
03599         return X86::COND_S;
03600       }
03601       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
03602         // X < 1   -> X <= 0
03603         RHS = DAG.getConstant(0, RHS.getValueType());
03604         return X86::COND_LE;
03605       }
03606     }
03607 
03608     switch (SetCCOpcode) {
03609     default: llvm_unreachable("Invalid integer condition!");
03610     case ISD::SETEQ:  return X86::COND_E;
03611     case ISD::SETGT:  return X86::COND_G;
03612     case ISD::SETGE:  return X86::COND_GE;
03613     case ISD::SETLT:  return X86::COND_L;
03614     case ISD::SETLE:  return X86::COND_LE;
03615     case ISD::SETNE:  return X86::COND_NE;
03616     case ISD::SETULT: return X86::COND_B;
03617     case ISD::SETUGT: return X86::COND_A;
03618     case ISD::SETULE: return X86::COND_BE;
03619     case ISD::SETUGE: return X86::COND_AE;
03620     }
03621   }
03622 
03623   // First determine if it is required or is profitable to flip the operands.
03624 
03625   // If LHS is a foldable load, but RHS is not, flip the condition.
03626   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
03627       !ISD::isNON_EXTLoad(RHS.getNode())) {
03628     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
03629     std::swap(LHS, RHS);
03630   }
03631 
03632   switch (SetCCOpcode) {
03633   default: break;
03634   case ISD::SETOLT:
03635   case ISD::SETOLE:
03636   case ISD::SETUGT:
03637   case ISD::SETUGE:
03638     std::swap(LHS, RHS);
03639     break;
03640   }
03641 
03642   // On a floating point condition, the flags are set as follows:
03643   // ZF  PF  CF   op
03644   //  0 | 0 | 0 | X > Y
03645   //  0 | 0 | 1 | X < Y
03646   //  1 | 0 | 0 | X == Y
03647   //  1 | 1 | 1 | unordered
03648   switch (SetCCOpcode) {
03649   default: llvm_unreachable("Condcode should be pre-legalized away");
03650   case ISD::SETUEQ:
03651   case ISD::SETEQ:   return X86::COND_E;
03652   case ISD::SETOLT:              // flipped
03653   case ISD::SETOGT:
03654   case ISD::SETGT:   return X86::COND_A;
03655   case ISD::SETOLE:              // flipped
03656   case ISD::SETOGE:
03657   case ISD::SETGE:   return X86::COND_AE;
03658   case ISD::SETUGT:              // flipped
03659   case ISD::SETULT:
03660   case ISD::SETLT:   return X86::COND_B;
03661   case ISD::SETUGE:              // flipped
03662   case ISD::SETULE:
03663   case ISD::SETLE:   return X86::COND_BE;
03664   case ISD::SETONE:
03665   case ISD::SETNE:   return X86::COND_NE;
03666   case ISD::SETUO:   return X86::COND_P;
03667   case ISD::SETO:    return X86::COND_NP;
03668   case ISD::SETOEQ:
03669   case ISD::SETUNE:  return X86::COND_INVALID;
03670   }
03671 }
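// Worked example for the mapping above: an integer SETULT translates
// directly to COND_B, while a floating-point SETOLT first has its operands
// swapped (see the flip switch) and is then handled as the "flipped" SETOLT
// case, yielding COND_A per the flag table.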
03672 
03673 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
03674 /// code. The current x86 ISA includes the following FP cmov instructions:
03675 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
03676 static bool hasFPCMov(unsigned X86CC) {
03677   switch (X86CC) {
03678   default:
03679     return false;
03680   case X86::COND_B:
03681   case X86::COND_BE:
03682   case X86::COND_E:
03683   case X86::COND_P:
03684   case X86::COND_A:
03685   case X86::COND_AE:
03686   case X86::COND_NE:
03687   case X86::COND_NP:
03688     return true;
03689   }
03690 }
03691 
03692 /// isFPImmLegal - Returns true if the target can instruction select the
03693 /// specified FP immediate natively. If false, the legalizer will
03694 /// materialize the FP immediate as a load from a constant pool.
03695 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
03696   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
03697     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
03698       return true;
03699   }
03700   return false;
03701 }
03702 
03703 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
03704                                               ISD::LoadExtType ExtTy,
03705                                               EVT NewVT) const {
03706   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
03707   // relocation target a movq or addq instruction: don't let the load shrink.
03708   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
03709   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
03710     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
03711       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
03712   return true;
03713 }
03714 
03715 /// \brief Returns true if it is beneficial to convert a load of a constant
03716 /// to just the constant itself.
03717 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
03718                                                           Type *Ty) const {
03719   assert(Ty->isIntegerTy());
03720 
03721   unsigned BitSize = Ty->getPrimitiveSizeInBits();
03722   if (BitSize == 0 || BitSize > 64)
03723     return false;
03724   return true;
03725 }
03726 
03727 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
03728                                                 unsigned Index) const {
03729   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
03730     return false;
03731 
03732   return (Index == 0 || Index == ResVT.getVectorNumElements());
03733 }
03734 
03735 bool X86TargetLowering::isCheapToSpeculateCttz() const {
03736   // Speculate cttz only if we can directly use TZCNT.
03737   return Subtarget->hasBMI();
03738 }
03739 
03740 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
03741   // Speculate ctlz only if we can directly use LZCNT.
03742   return Subtarget->hasLZCNT();
03743 }
03744 
03745 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
03746 /// the specified half-open range [Low, Hi).
03747 static bool isUndefOrInRange(int Val, int Low, int Hi) {
03748   return (Val < 0) || (Val >= Low && Val < Hi);
03749 }
03750 
03751 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
03752 /// specified value.
03753 static bool isUndefOrEqual(int Val, int CmpVal) {
03754   return (Val < 0 || Val == CmpVal);
03755 }
03756 
03757 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
03758 /// from position Pos and ending in Pos+Size, falls within the specified
03759 /// sequential range [Low, Low+Size), or is undef.
03760 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
03761                                        unsigned Pos, unsigned Size, int Low) {
03762   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
03763     if (!isUndefOrEqual(Mask[i], Low))
03764       return false;
03765   return true;
03766 }
03767 
03768 /// isVEXTRACTIndex - Return true if the specified
03769 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
03770 /// suitable for instructions that extract 128- or 256-bit vectors.
03771 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
03772   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
03773   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
03774     return false;
03775 
03776   // The index should be aligned on a vecWidth-bit boundary.
03777   uint64_t Index =
03778     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
03779 
03780   MVT VT = N->getSimpleValueType(0);
03781   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
03782   bool Result = (Index * ElSize) % vecWidth == 0;
03783 
03784   return Result;
03785 }
03786 
03787 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
03788 /// operand specifies a subvector insert that is suitable for the
03789 /// insertion of 128- or 256-bit subvectors.
03790 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
03791   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
03792   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
03793     return false;
03794   // The index should be aligned on a vecWidth-bit boundary.
03795   uint64_t Index =
03796     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
03797 
03798   MVT VT = N->getSimpleValueType(0);
03799   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
03800   bool Result = (Index * ElSize) % vecWidth == 0;
03801 
03802   return Result;
03803 }
03804 
03805 bool X86::isVINSERT128Index(SDNode *N) {
03806   return isVINSERTIndex(N, 128);
03807 }
03808 
03809 bool X86::isVINSERT256Index(SDNode *N) {
03810   return isVINSERTIndex(N, 256);
03811 }
03812 
03813 bool X86::isVEXTRACT128Index(SDNode *N) {
03814   return isVEXTRACTIndex(N, 128);
03815 }
03816 
03817 bool X86::isVEXTRACT256Index(SDNode *N) {
03818   return isVEXTRACTIndex(N, 256);
03819 }
03820 
03821 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
03822   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
03823   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
03824     llvm_unreachable("Illegal extract subvector for VEXTRACT");
03825 
03826   uint64_t Index =
03827     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
03828 
03829   MVT VecVT = N->getOperand(0).getSimpleValueType();
03830   MVT ElVT = VecVT.getVectorElementType();
03831 
03832   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
03833   return Index / NumElemsPerChunk;
03834 }
03835 
03836 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
03837   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
03838   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
03839     llvm_unreachable("Illegal insert subvector for VINSERT");
03840 
03841   uint64_t Index =
03842     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
03843 
03844   MVT VecVT = N->getSimpleValueType(0);
03845   MVT ElVT = VecVT.getVectorElementType();
03846 
03847   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
03848   return Index / NumElemsPerChunk;
03849 }
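// Worked example for the two helpers above: for a v8f32 with a 128-bit
// chunk width, NumElemsPerChunk = 128 / 32 = 4, so an element index of 4
// produces the immediate 4 / 4 = 1, i.e. the upper 128-bit half.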
03850 
03851 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
03852 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
03853 /// and VEXTRACTI128 instructions.
03854 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
03855   return getExtractVEXTRACTImmediate(N, 128);
03856 }
03857 
03858 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
03859 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
03860 /// and VEXTRACTI64x4 instructions.
03861 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
03862   return getExtractVEXTRACTImmediate(N, 256);
03863 }
03864 
03865 /// getInsertVINSERT128Immediate - Return the appropriate immediate
03866 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
03867 /// and VINSERTI128 instructions.
03868 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
03869   return getInsertVINSERTImmediate(N, 128);
03870 }
03871 
03872 /// getInsertVINSERT256Immediate - Return the appropriate immediate
03873 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
03874 /// and VINSERTI64x4 instructions.
03875 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
03876   return getInsertVINSERTImmediate(N, 256);
03877 }
03878 
03879 /// isZero - Returns true if V is a constant integer zero.
03880 static bool isZero(SDValue V) {
03881   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
03882   return C && C->isNullValue();
03883 }
03884 
03885 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
03886 /// constant +0.0.
03887 bool X86::isZeroNode(SDValue Elt) {
03888   if (isZero(Elt))
03889     return true;
03890   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
03891     return CFP->getValueAPF().isPosZero();
03892   return false;
03893 }
03894 
03895 /// getZeroVector - Returns a vector of specified type with all zero elements.
03896 ///
03897 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
03898                              SelectionDAG &DAG, SDLoc dl) {
03899   assert(VT.isVector() && "Expected a vector type");
03900 
03901   // Always build SSE zero vectors as <4 x i32> bitcasted
03902   // to their dest type. This ensures they get CSE'd.
03903   SDValue Vec;
03904   if (VT.is128BitVector()) {  // SSE
03905     if (Subtarget->hasSSE2()) {  // SSE2
03906       SDValue Cst = DAG.getConstant(0, MVT::i32);
03907       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
03908     } else { // SSE1
03909       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
03910       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
03911     }
03912   } else if (VT.is256BitVector()) { // AVX
03913     if (Subtarget->hasInt256()) { // AVX2
03914       SDValue Cst = DAG.getConstant(0, MVT::i32);
03915       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
03916       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
03917     } else {
03918       // 256-bit logic and arithmetic instructions in AVX are all
03919       // floating-point; there is no support for integer ops. Emit FP zero vectors.
03920       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
03921       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
03922       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
03923     }
03924   } else if (VT.is512BitVector()) { // AVX-512
03925       SDValue Cst = DAG.getConstant(0, MVT::i32);
03926       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
03927                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
03928       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
03929   } else if (VT.getScalarType() == MVT::i1) {
03930 
03931     assert((Subtarget->hasBWI() || VT.getVectorNumElements() <= 16)
03932             && "Unexpected vector type");
03933     assert((Subtarget->hasVLX() || VT.getVectorNumElements() >= 8)
03934             && "Unexpected vector type");
03935     SDValue Cst = DAG.getConstant(0, MVT::i1);
03936     SmallVector<SDValue, 64> Ops(VT.getVectorNumElements(), Cst);
03937     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
03938   } else
03939     llvm_unreachable("Unexpected vector type");
03940 
03941   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
03942 }
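// E.g. a zero v4f32 on an SSE2 target is built as a v4i32 of zero constants
// and then bitcast to v4f32, so all 128-bit zero vectors share one CSE'd
// BUILD_VECTOR node.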
03943 
03944 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
03945                                 SelectionDAG &DAG, SDLoc dl,
03946                                 unsigned vectorWidth) {
03947   assert((vectorWidth == 128 || vectorWidth == 256) &&
03948          "Unsupported vector width");
03949   EVT VT = Vec.getValueType();
03950   EVT ElVT = VT.getVectorElementType();
03951   unsigned Factor = VT.getSizeInBits()/vectorWidth;
03952   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
03953                                   VT.getVectorNumElements()/Factor);
03954 
03955   // Extract from UNDEF is UNDEF.
03956   if (Vec.getOpcode() == ISD::UNDEF)
03957     return DAG.getUNDEF(ResultVT);
03958 
03959   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
03960   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
03961 
03962   // This is the index of the first element of the vectorWidth-bit chunk
03963   // we want.
03964   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
03965                                * ElemsPerChunk);
03966 
03967   // If the input is a buildvector just emit a smaller one.
03968   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
03969     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
03970                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
03971                                     ElemsPerChunk));
03972 
03973   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
03974   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
03975 }
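// E.g. extracting 128 bits from a v8i32 with IdxVal = 5 rounds the index
// down to a chunk boundary: ElemsPerChunk = 4 and
// NormalizedIdxVal = ((5 * 32) / 128) * 4 = 4, so elements 4..7 are taken.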
03976 
03977 /// Generate a DAG to grab 128 bits from a vector > 128 bits.  This
03978 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
03979 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
03980 /// instructions or a simple subregister reference. Idx is an index in the
03981 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
03982 /// lowering EXTRACT_VECTOR_ELT operations easier.
03983 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
03984                                    SelectionDAG &DAG, SDLoc dl) {
03985   assert((Vec.getValueType().is256BitVector() ||
03986           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
03987   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
03988 }
03989 
03990 /// Generate a DAG to grab 256-bits from a 512-bit vector.
03991 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
03992                                    SelectionDAG &DAG, SDLoc dl) {
03993   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
03994   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
03995 }
03996 
03997 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
03998                                unsigned IdxVal, SelectionDAG &DAG,
03999                                SDLoc dl, unsigned vectorWidth) {
04000   assert((vectorWidth == 128 || vectorWidth == 256) &&
04001          "Unsupported vector width");
04002   // Inserting an UNDEF subvector leaves Result unchanged.
04003   if (Vec.getOpcode() == ISD::UNDEF)
04004     return Result;
04005   EVT VT = Vec.getValueType();
04006   EVT ElVT = VT.getVectorElementType();
04007   EVT ResultVT = Result.getValueType();
04008 
04009   // Insert the relevant vectorWidth bits.
04010   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
04011 
04012   // This is the index of the first element of the vectorWidth-bit chunk
04013   // we want.
04014   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
04015                                * ElemsPerChunk);
04016 
04017   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
04018   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
04019 }
04020 
04021 /// Generate a DAG to put 128 bits into a vector > 128 bits.  This
04022 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
04023 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
04024 /// simple superregister reference.  Idx is an index in the 128 bits
04025 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
04026 /// lowering INSERT_VECTOR_ELT operations easier.
04027 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
04028                                   SelectionDAG &DAG, SDLoc dl) {
04029   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
04030 
04031   // For insertion into the zero index (low half) of a 256-bit vector, it is
04032   // more efficient to generate a blend with immediate instead of an insert*128.
04033   // We are still creating an INSERT_SUBVECTOR below with an undef node to
04034   // extend the subvector to the size of the result vector. Make sure that
04035   // we are not recursing on that node by checking for undef here.
04036   if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
04037       Result.getOpcode() != ISD::UNDEF) {
04038     EVT ResultVT = Result.getValueType();
04039     SDValue ZeroIndex = DAG.getIntPtrConstant(0);
04040     SDValue Undef = DAG.getUNDEF(ResultVT);
04041     SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
04042                                  Vec, ZeroIndex);
04043 
04044     // The blend instruction, and therefore its mask, depend on the data type.
04045     MVT ScalarType = ResultVT.getScalarType().getSimpleVT();
04046     if (ScalarType.isFloatingPoint()) {
04047       // Choose either vblendps (float) or vblendpd (double).
04048       unsigned ScalarSize = ScalarType.getSizeInBits();
04049       assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
04050       unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
04051       SDValue Mask = DAG.getConstant(MaskVal, MVT::i8);
04052       return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
04053     }
04054 
04055     const X86Subtarget &Subtarget =
04056         static_cast<const X86Subtarget &>(DAG.getSubtarget());
04057 
04058     // AVX2 is needed for 256-bit integer blend support.
04059     // Integers must be cast to 32-bit because there is only vpblendd;
04060     // vpblendw can't be used for this because it has a handicapped mask.
04061 
04062     // If we don't have AVX2, then cast to float. Using a wrong domain blend
04063     // is still more efficient than using the wrong domain vinsertf128 that
04064     // will be created by InsertSubVector().
04065     MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
04066 
04067     SDValue Mask = DAG.getConstant(0x0f, MVT::i8);
04068     Vec256 = DAG.getNode(ISD::BITCAST, dl, CastVT, Vec256);
04069     Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
04070     return DAG.getNode(ISD::BITCAST, dl, ResultVT, Vec256);
04071   }
04072 
04073   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
04074 }
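// Note on the blend masks used above: the blend selects from Vec256 for
// every mask bit that is set, so 0x03 takes the two low f64 elements and
// 0x0f takes the four low 32-bit elements, which is exactly the 128-bit
// subvector being inserted at index 0.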
04075 
04076 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
04077                                   SelectionDAG &DAG, SDLoc dl) {
04078   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
04079   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
04080 }
04081 
04082 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
04083 /// instructions. This is used because creating CONCAT_VECTORS nodes of
04084 /// BUILD_VECTORs returns a larger BUILD_VECTOR while we're trying to lower
04085 /// large BUILD_VECTORs.
04086 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
04087                                    unsigned NumElems, SelectionDAG &DAG,
04088                                    SDLoc dl) {
04089   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
04090   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
04091 }
04092 
04093 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
04094                                    unsigned NumElems, SelectionDAG &DAG,
04095                                    SDLoc dl) {
04096   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
04097   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
04098 }
04099 
04100 /// getOnesVector - Returns a vector of specified type with all bits set.
04101 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
04102 /// no AVX2 support, use two <4 x i32> vectors inserted into an <8 x i32> appropriately.
04103 /// Then bitcast to their original type, ensuring they get CSE'd.
04104 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
04105                              SDLoc dl) {
04106   assert(VT.isVector() && "Expected a vector type");
04107 
04108   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
04109   SDValue Vec;
04110   if (VT.is256BitVector()) {
04111     if (HasInt256) { // AVX2
04112       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
04113       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
04114     } else { // AVX
04115       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04116       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
04117     }
04118   } else if (VT.is128BitVector()) {
04119     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
04120   } else
04121     llvm_unreachable("Unexpected vector type");
04122 
04123   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
04124 }
04125 
04126 /// getMOVL - Returns a vector_shuffle node for a movs{s|d} or movd
04127 /// operation of the specified width.
04128 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
04129                        SDValue V2) {
04130   unsigned NumElems = VT.getVectorNumElements();
04131   SmallVector<int, 8> Mask;
04132   Mask.push_back(NumElems);
04133   for (unsigned i = 1; i != NumElems; ++i)
04134     Mask.push_back(i);
04135   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04136 }
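// E.g. for a v4i32 operation this builds the mask <4,1,2,3>: element 0 is
// taken from V2 (indices >= NumElems select from V2) and the remaining
// elements are kept from V1, matching movss/movsd semantics.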
04137 
04138 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
04139 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
04140                           SDValue V2) {
04141   unsigned NumElems = VT.getVectorNumElements();
04142   SmallVector<int, 8> Mask;
04143   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
04144     Mask.push_back(i);
04145     Mask.push_back(i + NumElems);
04146   }
04147   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04148 }
04149 
04150 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
04151 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
04152                           SDValue V2) {
04153   unsigned NumElems = VT.getVectorNumElements();
04154   SmallVector<int, 8> Mask;
04155   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
04156     Mask.push_back(i + Half);
04157     Mask.push_back(i + NumElems + Half);
04158   }
04159   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
04160 }
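// E.g. for v4i32, getUnpackl builds the mask <0,4,1,5> and getUnpackh
// builds <2,6,3,7>, interleaving the low and high halves of V1 and V2
// respectively (indices >= 4 select from V2).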
04161 
04162 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
04163 /// vector and a zero or undef vector.  This produces a shuffle where the low
04164 /// element of V2 is swizzled into the zero/undef vector, landing at element
04165 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
04166 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
04167                                            bool IsZero,
04168                                            const X86Subtarget *Subtarget,
04169                                            SelectionDAG &DAG) {
04170   MVT VT = V2.getSimpleValueType();
04171   SDValue V1 = IsZero
04172     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
04173   unsigned NumElems = VT.getVectorNumElements();
04174   SmallVector<int, 16> MaskVec;
04175   for (unsigned i = 0; i != NumElems; ++i)
04176     // If this is the insertion idx, put the low elt of V2 here.
04177     MaskVec.push_back(i == Idx ? NumElems : i);
04178   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
04179 }
04180 
04181 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
04182 /// target specific opcode. Returns true if the Mask could be calculated. Sets
04183 /// IsUnary to true if only uses one source. Note that this will set IsUnary for
04184 /// shuffles which use a single input multiple times, and in those cases it will
04185 /// adjust the mask to only have indices within that single input.
04186 static bool getTargetShuffleMask(SDNode *N, MVT VT,
04187                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
04188   unsigned NumElems = VT.getVectorNumElements();
04189   SDValue ImmN;
04190 
04191   IsUnary = false;
04192   bool IsFakeUnary = false;
04193   switch(N->getOpcode()) {
04194   case X86ISD::BLENDI:
04195     ImmN = N->getOperand(N->getNumOperands()-1);
04196     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04197     break;
04198   case X86ISD::SHUFP:
04199     ImmN = N->getOperand(N->getNumOperands()-1);
04200     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04201     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04202     break;
04203   case X86ISD::UNPCKH:
04204     DecodeUNPCKHMask(VT, Mask);
04205     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04206     break;
04207   case X86ISD::UNPCKL:
04208     DecodeUNPCKLMask(VT, Mask);
04209     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04210     break;
04211   case X86ISD::MOVHLPS:
04212     DecodeMOVHLPSMask(NumElems, Mask);
04213     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04214     break;
04215   case X86ISD::MOVLHPS:
04216     DecodeMOVLHPSMask(NumElems, Mask);
04217     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
04218     break;
04219   case X86ISD::PALIGNR:
04220     ImmN = N->getOperand(N->getNumOperands()-1);
04221     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04222     break;
04223   case X86ISD::PSHUFD:
04224   case X86ISD::VPERMILPI:
04225     ImmN = N->getOperand(N->getNumOperands()-1);
04226     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04227     IsUnary = true;
04228     break;
04229   case X86ISD::PSHUFHW:
04230     ImmN = N->getOperand(N->getNumOperands()-1);
04231     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04232     IsUnary = true;
04233     break;
04234   case X86ISD::PSHUFLW:
04235     ImmN = N->getOperand(N->getNumOperands()-1);
04236     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04237     IsUnary = true;
04238     break;
04239   case X86ISD::PSHUFB: {
04240     IsUnary = true;
04241     SDValue MaskNode = N->getOperand(1);
04242     while (MaskNode->getOpcode() == ISD::BITCAST)
04243       MaskNode = MaskNode->getOperand(0);
04244 
04245     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
04246       // If we have a build-vector, then things are easy.
04247       EVT VT = MaskNode.getValueType();
04248       assert(VT.isVector() &&
04249              "Can't produce a non-vector with a build_vector!");
04250       if (!VT.isInteger())
04251         return false;
04252 
04253       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
04254 
04255       SmallVector<uint64_t, 32> RawMask;
04256       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
04257         SDValue Op = MaskNode->getOperand(i);
04258         if (Op->getOpcode() == ISD::UNDEF) {
04259           RawMask.push_back((uint64_t)SM_SentinelUndef);
04260           continue;
04261         }
04262         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
04263         if (!CN)
04264           return false;
04265         APInt MaskElement = CN->getAPIntValue();
04266 
04267         // We now have to decode the element which could be any integer size and
04268         // extract each byte of it.
04269         for (int j = 0; j < NumBytesPerElement; ++j) {
04270           // Note that this is x86 and so always little endian: the low byte is
04271           // the first byte of the mask.
04272           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
04273           MaskElement = MaskElement.lshr(8);
04274         }
04275       }
04276       DecodePSHUFBMask(RawMask, Mask);
04277       break;
04278     }
04279 
04280     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
04281     if (!MaskLoad)
04282       return false;
04283 
04284     SDValue Ptr = MaskLoad->getBasePtr();
04285     if (Ptr->getOpcode() == X86ISD::Wrapper)
04286       Ptr = Ptr->getOperand(0);
04287 
04288     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
04289     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
04290       return false;
04291 
04292     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
04293       DecodePSHUFBMask(C, Mask);
04294       if (Mask.empty())
04295         return false;
04296       break;
04297     }
04298 
04299     return false;
04300   }
04301   case X86ISD::VPERMI:
04302     ImmN = N->getOperand(N->getNumOperands()-1);
04303     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04304     IsUnary = true;
04305     break;
04306   case X86ISD::MOVSS:
04307   case X86ISD::MOVSD:
04308     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
04309     break;
04310   case X86ISD::VPERM2X128:
04311     ImmN = N->getOperand(N->getNumOperands()-1);
04312     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
04313     if (Mask.empty()) return false;
04314     break;
04315   case X86ISD::MOVSLDUP:
04316     DecodeMOVSLDUPMask(VT, Mask);
04317     IsUnary = true;
04318     break;
04319   case X86ISD::MOVSHDUP:
04320     DecodeMOVSHDUPMask(VT, Mask);
04321     IsUnary = true;
04322     break;
04323   case X86ISD::MOVDDUP:
04324     DecodeMOVDDUPMask(VT, Mask);
04325     IsUnary = true;
04326     break;
04327   case X86ISD::MOVLHPD:
04328   case X86ISD::MOVLPD:
04329   case X86ISD::MOVLPS:
04330     // Not yet implemented
04331     return false;
04332   default: llvm_unreachable("unknown target shuffle node");
04333   }
04334 
04335   // If we have a fake unary shuffle, the shuffle mask is spread across two
04336   // inputs that are actually the same node. Re-map the mask to always point
04337   // into the first input.
04338   if (IsFakeUnary)
04339     for (int &M : Mask)
04340       if (M >= (int)Mask.size())
04341         M -= Mask.size();
04342 
04343   return true;
04344 }
04345 
04346 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
04347 /// element of the result of the vector shuffle.
04348 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
04349                                    unsigned Depth) {
04350   if (Depth == 6)
04351     return SDValue();  // Limit search depth.
04352 
04353   SDValue V = SDValue(N, 0);
04354   EVT VT = V.getValueType();
04355   unsigned Opcode = V.getOpcode();
04356 
04357   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
04358   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
04359     int Elt = SV->getMaskElt(Index);
04360 
04361     if (Elt < 0)
04362       return DAG.getUNDEF(VT.getVectorElementType());
04363 
04364     unsigned NumElems = VT.getVectorNumElements();
04365     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
04366                                          : SV->getOperand(1);
04367     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
04368   }
04369 
04370   // Recurse into target specific vector shuffles to find scalars.
04371   if (isTargetShuffle(Opcode)) {
04372     MVT ShufVT = V.getSimpleValueType();
04373     unsigned NumElems = ShufVT.getVectorNumElements();
04374     SmallVector<int, 16> ShuffleMask;
04375     bool IsUnary;
04376 
04377     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
04378       return SDValue();
04379 
04380     int Elt = ShuffleMask[Index];
04381     if (Elt < 0)
04382       return DAG.getUNDEF(ShufVT.getVectorElementType());
04383 
04384     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
04385                                          : N->getOperand(1);
04386     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
04387                                Depth+1);
04388   }
04389 
04390   // Actual nodes that may contain scalar elements
04391   if (Opcode == ISD::BITCAST) {
04392     V = V.getOperand(0);
04393     EVT SrcVT = V.getValueType();
04394     unsigned NumElems = VT.getVectorNumElements();
04395 
04396     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
04397       return SDValue();
04398   }
04399 
04400   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
04401     return (Index == 0) ? V.getOperand(0)
04402                         : DAG.getUNDEF(VT.getVectorElementType());
04403 
04404   if (V.getOpcode() == ISD::BUILD_VECTOR)
04405     return V.getOperand(Index);
04406 
04407   return SDValue();
04408 }
04409 
04410 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
04411 ///
04412 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
04413                                        unsigned NumNonZero, unsigned NumZero,
04414                                        SelectionDAG &DAG,
04415                                        const X86Subtarget* Subtarget,
04416                                        const TargetLowering &TLI) {
04417   if (NumNonZero > 8)
04418     return SDValue();
04419 
04420   SDLoc dl(Op);
04421   SDValue V;
04422   bool First = true;
04423   for (unsigned i = 0; i < 16; ++i) {
04424     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
04425     if (ThisIsNonZero && First) {
04426       if (NumZero)
04427         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
04428       else
04429         V = DAG.getUNDEF(MVT::v8i16);
04430       First = false;
04431     }
04432 
04433     if ((i & 1) != 0) {
04434       SDValue ThisElt, LastElt;
04435       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
04436       if (LastIsNonZero) {
04437         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
04438                               MVT::i16, Op.getOperand(i-1));
04439       }
04440       if (ThisIsNonZero) {
04441         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
04442         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
04443                               ThisElt, DAG.getConstant(8, MVT::i8));
04444         if (LastIsNonZero)
04445           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
04446       } else
04447         ThisElt = LastElt;
04448 
04449       if (ThisElt.getNode())
04450         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
04451                         DAG.getIntPtrConstant(i/2));
04452     }
04453   }
04454 
04455   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
04456 }
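// In the lowering above, adjacent byte pairs are combined into 16-bit
// values (low byte | high byte << 8), inserted into a v8i16 at index i/2,
// and the result is bitcast back to v16i8.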
04457 
04458 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
04459 ///
04460 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
04461                                      unsigned NumNonZero, unsigned NumZero,
04462                                      SelectionDAG &DAG,
04463                                      const X86Subtarget* Subtarget,
04464                                      const TargetLowering &TLI) {
04465   if (NumNonZero > 4)
04466     return SDValue();
04467 
04468   SDLoc dl(Op);
04469   SDValue V;
04470   bool First = true;
04471   for (unsigned i = 0; i < 8; ++i) {
04472     bool isNonZero = (NonZeros & (1 << i)) != 0;
04473     if (isNonZero) {
04474       if (First) {
04475         if (NumZero)
04476           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
04477         else
04478           V = DAG.getUNDEF(MVT::v8i16);
04479         First = false;
04480       }
04481       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
04482                       MVT::v8i16, V, Op.getOperand(i),
04483                       DAG.getIntPtrConstant(i));
04484     }
04485   }
04486 
04487   return V;
04488 }
04489 
04490 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
04491 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
04492                                      const X86Subtarget *Subtarget,
04493                                      const TargetLowering &TLI) {
04494   // Find all zeroable elements.
04495   std::bitset<4> Zeroable;
04496   for (int i=0; i < 4; ++i) {
04497     SDValue Elt = Op->getOperand(i);
04498     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
04499   }
04500   assert(Zeroable.size() - Zeroable.count() > 1 &&
04501          "We expect at least two non-zero elements!");
04502 
04503   // We only know how to deal with build_vector nodes where elements are either
04504   // zeroable or extract_vector_elt with constant index.
04505   SDValue FirstNonZero;
04506   unsigned FirstNonZeroIdx;
04507   for (unsigned i=0; i < 4; ++i) {
04508     if (Zeroable[i])
04509       continue;
04510     SDValue Elt = Op->getOperand(i);
04511     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
04512         !isa<ConstantSDNode>(Elt.getOperand(1)))
04513       return SDValue();
04514     // Make sure that this node is extracting from a 128-bit vector.
04515     MVT VT = Elt.getOperand(0).getSimpleValueType();
04516     if (!VT.is128BitVector())
04517       return SDValue();
04518     if (!FirstNonZero.getNode()) {
04519       FirstNonZero = Elt;
04520       FirstNonZeroIdx = i;
04521     }
04522   }
04523 
04524   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
04525   SDValue V1 = FirstNonZero.getOperand(0);
04526   MVT VT = V1.getSimpleValueType();
04527 
04528   // See if this build_vector can be lowered as a blend with zero.
04529   SDValue Elt;
04530   unsigned EltMaskIdx, EltIdx;
04531   int Mask[4];
04532   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
04533     if (Zeroable[EltIdx]) {
04534       // The zero vector will be on the right hand side.
04535       Mask[EltIdx] = EltIdx+4;
04536       continue;
04537     }
04538 
04539     Elt = Op->getOperand(EltIdx);
04540     // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
04541     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
04542     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
04543       break;
04544     Mask[EltIdx] = EltIdx;
04545   }
04546 
04547   if (EltIdx == 4) {
04548     // Let the shuffle legalizer deal with blend operations.
04549     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
04550     if (V1.getSimpleValueType() != VT)
04551       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
04552     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
04553   }
04554 
04555   // See if we can lower this build_vector to a INSERTPS.
04556   if (!Subtarget->hasSSE41())
04557     return SDValue();
04558 
04559   SDValue V2 = Elt.getOperand(0);
04560   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
04561     V1 = SDValue();
04562 
04563   bool CanFold = true;
04564   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
04565     if (Zeroable[i])
04566       continue;
04567 
04568     SDValue Current = Op->getOperand(i);
04569     SDValue SrcVector = Current->getOperand(0);
04570     if (!V1.getNode())
04571       V1 = SrcVector;
04572     CanFold = SrcVector == V1 &&
04573       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
04574   }
04575 
04576   if (!CanFold)
04577     return SDValue();
04578 
04579   assert(V1.getNode() && "Expected at least two non-zero elements!");
04580   if (V1.getSimpleValueType() != MVT::v4f32)
04581     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
04582   if (V2.getSimpleValueType() != MVT::v4f32)
04583     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
04584 
04585   // Ok, we can emit an INSERTPS instruction.
04586   unsigned ZMask = Zeroable.to_ulong();
04587 
04588   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
04589   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
04590   SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
04591                                DAG.getIntPtrConstant(InsertPSMask));
04592   return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
04593 }
04594 
04595 /// Return a vector logical shift node.
04596 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
04597                          unsigned NumBits, SelectionDAG &DAG,
04598                          const TargetLowering &TLI, SDLoc dl) {
04599   assert(VT.is128BitVector() && "Unknown type for VShift");
04600   MVT ShVT = MVT::v2i64;
04601   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
04602   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
04603   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
04604   assert(NumBits % 8 == 0 && "Only support byte sized shifts");
04605   SDValue ShiftVal = DAG.getConstant(NumBits/8, ScalarShiftTy);
04606   return DAG.getNode(ISD::BITCAST, dl, VT,
04607                      DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
04608 }
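// E.g. a 32-bit left shift of a 128-bit vector becomes an X86ISD::VSHLDQ
// (byte shift) by 32 / 8 = 4 bytes on the v2i64-bitcast value, which is
// then bitcast back to the original type.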
04609 
04610 static SDValue
04611 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
04612 
04613   // Check if the scalar load can be widened into a vector load, and if
04614   // the address is "base + cst", see if the cst can be "absorbed" into
04615   // the shuffle mask.
04616   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
04617     SDValue Ptr = LD->getBasePtr();
04618     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
04619       return SDValue();
04620     EVT PVT = LD->getValueType(0);
04621     if (PVT != MVT::i32 && PVT != MVT::f32)
04622       return SDValue();
04623 
04624     int FI = -1;
04625     int64_t Offset = 0;
04626     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
04627       FI = FINode->getIndex();
04628       Offset = 0;
04629     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
04630                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
04631       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
04632       Offset = Ptr.getConstantOperandVal(1);
04633       Ptr = Ptr.getOperand(0);
04634     } else {
04635       return SDValue();
04636     }
04637 
04638     // FIXME: 256-bit vector instructions don't require a strict alignment,
04639     // improve this code to support it better.
04640     unsigned RequiredAlign = VT.getSizeInBits()/8;
04641     SDValue Chain = LD->getChain();
04642     // Make sure the stack object alignment is at least 16 or 32.
04643     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
04644     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
04645       if (MFI->isFixedObjectIndex(FI)) {
04646         // Can't change the alignment. FIXME: It's possible to compute
04647         // the exact stack offset and reference FI + adjust offset instead.
04648         // If someone *really* cares about this. That's the way to implement it.
04649         return SDValue();
04650       } else {
04651         MFI->setObjectAlignment(FI, RequiredAlign);
04652       }
04653     }
04654 
04655     // (Offset % 16 or 32) must be a multiple of 4. The address used is then
04656     // Ptr + (Offset & ~(RequiredAlign - 1)).
04657     if (Offset < 0)
04658       return SDValue();
04659     if ((Offset % RequiredAlign) & 3)
04660       return SDValue();
04661     int64_t StartOffset = Offset & ~(RequiredAlign-1);
04662     if (StartOffset)
04663       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
04664                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
04665 
04666     int EltNo = (Offset - StartOffset) >> 2;
04667     unsigned NumElems = VT.getVectorNumElements();
04668 
04669     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
04670     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
04671                              LD->getPointerInfo().getWithOffset(StartOffset),
04672                              false, false, false, 0);
04673 
04674     SmallVector<int, 8> Mask(NumElems, EltNo);
04675 
04676     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
04677   }
04678 
04679   return SDValue();
04680 }
04681 
04682 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
04683 /// elements can be replaced by a single large load which has the same value as
04684 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
04685 ///
04686 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
04687 ///
04688 /// FIXME: we'd also like to handle the case where the last elements are zero
04689 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
04690 /// There's even a handy isZeroNode for that purpose.
04691 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
04692                                         SDLoc &DL, SelectionDAG &DAG,
04693                                         bool isAfterLegalize) {
04694   unsigned NumElems = Elts.size();
04695 
04696   LoadSDNode *LDBase = nullptr;
04697   unsigned LastLoadedElt = -1U;
04698 
04699   // For each element in the initializer, see if we've found a load or an undef.
04700   // If we don't find an initial load element, or later load elements are
04701   // non-consecutive, bail out.
04702   for (unsigned i = 0; i < NumElems; ++i) {
04703     SDValue Elt = Elts[i];
04704     // Look through a bitcast.
04705     if (Elt.getNode() && Elt.getOpcode() == ISD::BITCAST)
04706       Elt = Elt.getOperand(0);
04707     if (!Elt.getNode() ||
04708         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
04709       return SDValue();
04710     if (!LDBase) {
04711       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
04712         return SDValue();
04713       LDBase = cast<LoadSDNode>(Elt.getNode());
04714       LastLoadedElt = i;
04715       continue;
04716     }
04717     if (Elt.getOpcode() == ISD::UNDEF)
04718       continue;
04719 
04720     LoadSDNode *LD = cast<LoadSDNode>(Elt);
04721     EVT LdVT = Elt.getValueType();
04722     // Each loaded element must be the correct fractional portion of the
04723     // requested vector load.
04724     if (LdVT.getSizeInBits() != VT.getSizeInBits() / NumElems)
04725       return SDValue();
04726     if (!DAG.isConsecutiveLoad(LD, LDBase, LdVT.getSizeInBits() / 8, i))
04727       return SDValue();
04728     LastLoadedElt = i;
04729   }
04730 
04731   // If we have found an entire vector of loads and undefs, then return a large
04732   // load of the entire vector width starting at the base pointer.  If we found
04733   // consecutive loads for the low half, generate a vzext_load node.
04734   if (LastLoadedElt == NumElems - 1) {
04735     assert(LDBase && "Did not find base load for merging consecutive loads");
04736     EVT EltVT = LDBase->getValueType(0);
04737     // Ensure that the input vector size for the merged loads matches the
04738     // cumulative size of the input elements.
04739     if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
04740       return SDValue();
04741 
04742     if (isAfterLegalize &&
04743         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
04744       return SDValue();
04745 
04746     SDValue NewLd = SDValue();
04747 
04748     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
04749                         LDBase->getPointerInfo(), LDBase->isVolatile(),
04750                         LDBase->isNonTemporal(), LDBase->isInvariant(),
04751                         LDBase->getAlignment());
04752 
04753     if (LDBase->hasAnyUseOfValue(1)) {
04754       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
04755                                      SDValue(LDBase, 1),
04756                                      SDValue(NewLd.getNode(), 1));
04757       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
04758       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
04759                              SDValue(NewLd.getNode(), 1));
04760     }
04761 
04762     return NewLd;
04763   }
04764 
04765   // TODO: The code below fires only for loading the low v2i32 / v2f32
04766   // of a v4i32 / v4f32. It's probably worth generalizing.
04767   EVT EltVT = VT.getVectorElementType();
04768   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
04769       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
04770     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
04771     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
04772     SDValue ResNode =
04773         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
04774                                 LDBase->getPointerInfo(),
04775                                 LDBase->getAlignment(),
04776                                 false/*isVolatile*/, true/*ReadMem*/,
04777                                 false/*WriteMem*/);
04778 
04779     // Make sure the newly-created LOAD is in the same position as LDBase in
04780     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
04781     // update uses of LDBase's output chain to use the TokenFactor.
04782     if (LDBase->hasAnyUseOfValue(1)) {
04783       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
04784                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
04785       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
04786       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
04787                              SDValue(ResNode.getNode(), 1));
04788     }
04789 
04790     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
04791   }
04792   return SDValue();
04793 }
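
// As a rough sketch of what the routine above matches (addresses and types
// chosen purely for illustration): a v4f32 build_vector of four consecutive
// loads
//   (f32 load [P+0]), (f32 load [P+4]), (f32 load [P+8]), (f32 load [P+12])
// is folded into the single wide load (v4f32 load [P+0]), while a
// build_vector whose two low elements are consecutive loads and whose high
// elements are undef becomes (bitcast (X86ISD::VZEXT_LOAD [P+0])), the
// v2i32 / v2f32 case handled at the end of the function.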
04794 
04795 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
04796 /// to generate a splat value for the following cases:
04797 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
04798 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
04799 /// a scalar load, or a constant.
04800 /// The VBROADCAST node is returned when a pattern is found,
04801 /// or SDValue() otherwise.
04802 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
04803                                     SelectionDAG &DAG) {
04804   // VBROADCAST requires AVX.
04805   // TODO: Splats could be generated for non-AVX CPUs using SSE
04806   // instructions, but there's less potential gain for only 128-bit vectors.
04807   if (!Subtarget->hasAVX())
04808     return SDValue();
04809 
04810   MVT VT = Op.getSimpleValueType();
04811   SDLoc dl(Op);
04812 
04813   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
04814          "Unsupported vector type for broadcast.");
04815 
04816   SDValue Ld;
04817   bool ConstSplatVal;
04818 
04819   switch (Op.getOpcode()) {
04820     default:
04821       // Unknown pattern found.
04822       return SDValue();
04823 
04824     case ISD::BUILD_VECTOR: {
04825       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
04826       BitVector UndefElements;
04827       SDValue Splat = BVOp->getSplatValue(&UndefElements);
04828 
04829       // We need a splat of a single value to use broadcast, and it doesn't
04830       // make any sense if the value is only in one element of the vector.
04831       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
04832         return SDValue();
04833 
04834       Ld = Splat;
04835       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
04836                        Ld.getOpcode() == ISD::ConstantFP);
04837 
04838       // Make sure that all of the users of a non-constant load are from the
04839       // BUILD_VECTOR node.
04840       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
04841         return SDValue();
04842       break;
04843     }
04844 
04845     case ISD::VECTOR_SHUFFLE: {
04846       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
04847 
04848       // Shuffles must have a splat mask where the first element is
04849       // broadcasted.
04850       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
04851         return SDValue();
04852 
04853       SDValue Sc = Op.getOperand(0);
04854       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
04855           Sc.getOpcode() != ISD::BUILD_VECTOR) {
04856 
04857         if (!Subtarget->hasInt256())
04858           return SDValue();
04859 
04860         // Use the register form of the broadcast instruction available on AVX2.
04861         if (VT.getSizeInBits() >= 256)
04862           Sc = Extract128BitVector(Sc, 0, DAG, dl);
04863         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
04864       }
04865 
04866       Ld = Sc.getOperand(0);
04867       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
04868                        Ld.getOpcode() == ISD::ConstantFP);
04869 
04870       // The scalar_to_vector node and the suspected
04871       // load node must have exactly one user.
04872       // Constants may have multiple users.
04873 
04874       // AVX-512 has a register version of the broadcast.
04875       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
04876         Ld.getValueType().getSizeInBits() >= 32;
04877       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
04878           !hasRegVer))
04879         return SDValue();
04880       break;
04881     }
04882   }
04883 
04884   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
04885   bool IsGE256 = (VT.getSizeInBits() >= 256);
04886 
04887   // When optimizing for size, generate up to 5 extra bytes for a broadcast
04888   // instruction to save 8 or more bytes of constant pool data.
04889   // TODO: If multiple splats are generated to load the same constant,
04890   // it may be detrimental to overall size. There needs to be a way to detect
04891   // that condition to know if this is truly a size win.
04892   const Function *F = DAG.getMachineFunction().getFunction();
04893   bool OptForSize = F->hasFnAttribute(Attribute::OptimizeForSize);
04894 
04895   // Handle broadcasting a single constant scalar from the constant pool
04896   // into a vector.
04897   // On Sandybridge (no AVX2), it is still better to load a constant vector
04898   // from the constant pool and not to broadcast it from a scalar.
04899   // But override that restriction when optimizing for size.
04900   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
04901   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
04902     EVT CVT = Ld.getValueType();
04903     assert(!CVT.isVector() && "Must not broadcast a vector type");
04904 
04905     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
04906     // For size optimization, also splat v2f64 and v2i64, and for size opt
04907     // with AVX2, also splat i8 and i16.
04908     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
04909     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
04910         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
04911       const Constant *C = nullptr;
04912       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
04913         C = CI->getConstantIntValue();
04914       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
04915         C = CF->getConstantFPValue();
04916 
04917       assert(C && "Invalid constant type");
04918 
04919       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
04920       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
04921       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
04922       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
04923                        MachinePointerInfo::getConstantPool(),
04924                        false, false, false, Alignment);
04925 
04926       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
04927     }
04928   }
04929 
04930   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
04931 
04932   // Handle AVX2 in-register broadcasts.
04933   if (!IsLoad && Subtarget->hasInt256() &&
04934       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
04935     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
04936 
04937   // The scalar source must be a normal load.
04938   if (!IsLoad)
04939     return SDValue();
04940 
04941   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
04942       (Subtarget->hasVLX() && ScalarSize == 64))
04943     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
04944 
04945   // The integer check is needed for broadcasting 64-bit into 128-bit, so that
04946   // this doesn't match double: there is no vbroadcastsd xmm.
04947   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
04948     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
04949       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
04950   }
04951 
04952   // Unsupported broadcast.
04953   return SDValue();
04954 }
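
// A minimal sketch of the broadcast patterns recognized above (types and
// operands chosen for illustration): with AVX2, a splat such as
//   (v8f32 build_vector (f32 load [P]), (f32 load [P]), ..., (f32 load [P]))
// or
//   (vector_shuffle<0,0,...> (scalar_to_vector (f32 load [P])), undef)
// becomes (X86ISD::VBROADCAST (f32 load [P])). A constant splat is instead
// turned into a broadcast from a scalar constant-pool load, trading a few
// extra instruction bytes for 8 or more bytes of constant-pool data when
// optimizing for size.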
04955 
04956 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
04957 /// underlying vector and index.
04958 ///
04959 /// Modifies \p ExtractedFromVec to the real vector and returns the real
04960 /// index.
04961 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
04962                                          SDValue ExtIdx) {
04963   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
04964   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
04965     return Idx;
04966 
04967   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
04968   // lowered this:
04969   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
04970   // to:
04971   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
04972   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
04973   //                           undef)
04974   //                       Constant<0>)
04975   // In this case the vector is the extract_subvector expression and the index
04976   // is 2, as specified by the shuffle.
04977   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
04978   SDValue ShuffleVec = SVOp->getOperand(0);
04979   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
04980   assert(ShuffleVecVT.getVectorElementType() ==
04981          ExtractedFromVec.getSimpleValueType().getVectorElementType());
04982 
04983   int ShuffleIdx = SVOp->getMaskElt(Idx);
04984   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
04985     ExtractedFromVec = ShuffleVec;
04986     return ShuffleIdx;
04987   }
04988   return Idx;
04989 }
04990 
04991 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
04992   MVT VT = Op.getSimpleValueType();
04993 
04994   // Skip if insert_vec_elt is not supported.
04995   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
04996   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
04997     return SDValue();
04998 
04999   SDLoc DL(Op);
05000   unsigned NumElems = Op.getNumOperands();
05001 
05002   SDValue VecIn1;
05003   SDValue VecIn2;
05004   SmallVector<unsigned, 4> InsertIndices;
05005   SmallVector<int, 8> Mask(NumElems, -1);
05006 
05007   for (unsigned i = 0; i != NumElems; ++i) {
05008     unsigned Opc = Op.getOperand(i).getOpcode();
05009 
05010     if (Opc == ISD::UNDEF)
05011       continue;
05012 
05013     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
05014       // Quit if more than 1 element needs inserting.
05015       if (InsertIndices.size() > 1)
05016         return SDValue();
05017 
05018       InsertIndices.push_back(i);
05019       continue;
05020     }
05021 
05022     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
05023     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
05024     // Quit if non-constant index.
05025     if (!isa<ConstantSDNode>(ExtIdx))
05026       return SDValue();
05027     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
05028 
05029     // Quit if extracted from a vector of a different type.
05030     if (ExtractedFromVec.getValueType() != VT)
05031       return SDValue();
05032 
05033     if (!VecIn1.getNode())
05034       VecIn1 = ExtractedFromVec;
05035     else if (VecIn1 != ExtractedFromVec) {
05036       if (!VecIn2.getNode())
05037         VecIn2 = ExtractedFromVec;
05038       else if (VecIn2 != ExtractedFromVec)
05039         // Quit if there are more than 2 vectors to shuffle.
05040         return SDValue();
05041     }
05042 
05043     if (ExtractedFromVec == VecIn1)
05044       Mask[i] = Idx;
05045     else if (ExtractedFromVec == VecIn2)
05046       Mask[i] = Idx + NumElems;
05047   }
05048 
05049   if (!VecIn1.getNode())
05050     return SDValue();
05051 
05052   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
05053   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
05054   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
05055     unsigned Idx = InsertIndices[i];
05056     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
05057                      DAG.getIntPtrConstant(Idx));
05058   }
05059 
05060   return NV;
05061 }
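
// For illustration (operand shapes assumed): a v4i32 build_vector such as
//   (extract_elt A, 0), (extract_elt A, 2), X, (extract_elt B, 1)
// is rebuilt by the routine above as
//   insert_elt (vector_shuffle<0, 2, -1, 5> A, B), X, 2
// i.e. one shuffle of the two source vectors plus a single insert for the
// lone non-extract element X.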
05062 
05063 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
05064 SDValue
05065 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
05066 
05067   MVT VT = Op.getSimpleValueType();
05068   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
05069          "Unexpected type in LowerBUILD_VECTORvXi1!");
05070 
05071   SDLoc dl(Op);
05072   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
05073     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
05074     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
05075     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
05076   }
05077 
05078   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
05079     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
05080     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
05081     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
05082   }
05083 
05084   bool AllConstants = true;
05085   uint64_t Immediate = 0;
05086   int NonConstIdx = -1;
05087   bool IsSplat = true;
05088   unsigned NumNonConsts = 0;
05089   unsigned NumConsts = 0;
05090   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
05091     SDValue In = Op.getOperand(idx);
05092     if (In.getOpcode() == ISD::UNDEF)
05093       continue;
05094     if (!isa<ConstantSDNode>(In)) {
05095       AllConstants = false;
05096       NonConstIdx = idx;
05097       NumNonConsts++;
05098     } else {
05099       NumConsts++;
05100       if (cast<ConstantSDNode>(In)->getZExtValue())
05101         Immediate |= (1ULL << idx);
05102     }
05103     if (In != Op.getOperand(0))
05104       IsSplat = false;
05105   }
05106 
05107   if (AllConstants) {
05108     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
05109       DAG.getConstant(Immediate, MVT::i16));
05110     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
05111                        DAG.getIntPtrConstant(0));
05112   }
05113 
05114   if (NumNonConsts == 1 && NonConstIdx != 0) {
05115     SDValue DstVec;
05116     if (NumConsts) {
05117       SDValue VecAsImm = DAG.getConstant(Immediate,
05118                                          MVT::getIntegerVT(VT.getSizeInBits()));
05119       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
05120     }
05121     else
05122       DstVec = DAG.getUNDEF(VT);
05123     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
05124                        Op.getOperand(NonConstIdx),
05125                        DAG.getIntPtrConstant(NonConstIdx));
05126   }
05127   if (!IsSplat && (NonConstIdx != 0))
05128     llvm_unreachable("Unsupported BUILD_VECTOR operation");
05129   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
05130   SDValue Select;
05131   if (IsSplat)
05132     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
05133                           DAG.getConstant(-1, SelectVT),
05134                           DAG.getConstant(0, SelectVT));
05135   else
05136     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
05137                          DAG.getConstant((Immediate | 1), SelectVT),
05138                          DAG.getConstant(Immediate, SelectVT));
05139   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
05140 }
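
// A worked example of the all-constant path above (values chosen for
// illustration): the v8i1 build_vector <1, 0, 1, 1, 0, 0, 0, 0> collects
// Immediate = 0b00001101, which is materialized as the scalar (i16 13),
// bitcast to v16i1, and then narrowed back to v8i1 with extract_subvector.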
05141 
05142 /// \brief Return true if \p N implements a horizontal binop and return the
05143 /// operands of the horizontal binop in V0 and V1.
05144 ///
05145 /// This is a helper function of PerformBUILD_VECTORCombine.
05146 /// This function checks whether the input build_vector \p N implements a
05147 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
05148 /// operation to match.
05149 /// For example, if \p Opcode is equal to ISD::ADD, then this function
05150 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
05151 /// is equal to ISD::SUB, then this function checks if this is a horizontal
05152 /// arithmetic sub.
05153 ///
05154 /// This function only analyzes elements of \p N whose indices are
05155 /// in range [BaseIdx, LastIdx).
05156 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
05157                               SelectionDAG &DAG,
05158                               unsigned BaseIdx, unsigned LastIdx,
05159                               SDValue &V0, SDValue &V1) {
05160   EVT VT = N->getValueType(0);
05161 
05162   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
05163   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
05164          "Invalid Vector in input!");
05165 
05166   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
05167   bool CanFold = true;
05168   unsigned ExpectedVExtractIdx = BaseIdx;
05169   unsigned NumElts = LastIdx - BaseIdx;
05170   V0 = DAG.getUNDEF(VT);
05171   V1 = DAG.getUNDEF(VT);
05172 
05173   // Check if N implements a horizontal binop.
05174   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
05175     SDValue Op = N->getOperand(i + BaseIdx);
05176 
05177     // Skip UNDEFs.
05178     if (Op->getOpcode() == ISD::UNDEF) {
05179       // Update the expected vector extract index.
05180       if (i * 2 == NumElts)
05181         ExpectedVExtractIdx = BaseIdx;
05182       ExpectedVExtractIdx += 2;
05183       continue;
05184     }
05185 
05186     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
05187 
05188     if (!CanFold)
05189       break;
05190 
05191     SDValue Op0 = Op.getOperand(0);
05192     SDValue Op1 = Op.getOperand(1);
05193 
05194     // Try to match the following pattern:
05195     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
05196     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
05197         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
05198         Op0.getOperand(0) == Op1.getOperand(0) &&
05199         isa<ConstantSDNode>(Op0.getOperand(1)) &&
05200         isa<ConstantSDNode>(Op1.getOperand(1)));
05201     if (!CanFold)
05202       break;
05203 
05204     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
05205     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
05206 
05207     if (i * 2 < NumElts) {
05208       if (V0.getOpcode() == ISD::UNDEF)
05209         V0 = Op0.getOperand(0);
05210     } else {
05211       if (V1.getOpcode() == ISD::UNDEF)
05212         V1 = Op0.getOperand(0);
05213       if (i * 2 == NumElts)
05214         ExpectedVExtractIdx = BaseIdx;
05215     }
05216 
05217     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
05218     if (I0 == ExpectedVExtractIdx)
05219       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
05220     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
05221       // Try to match the following dag sequence:
05222       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
05223       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
05224     } else
05225       CanFold = false;
05226 
05227     ExpectedVExtractIdx += 2;
05228   }
05229 
05230   return CanFold;
05231 }
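
// A small sketch of what this matcher accepts (vector names are for
// illustration only): the v4i32 build_vector
//   (add (extract A, 0), (extract A, 1)),
//   (add (extract A, 2), (extract A, 3)),
//   (add (extract B, 0), (extract B, 1)),
//   (add (extract B, 2), (extract B, 3))
// is reported as a horizontal add with V0 = A and V1 = B, which matches the
// semantics of the HADD/PHADD family.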
05232 
05233 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
05234 /// a concat_vector.
05235 ///
05236 /// This is a helper function of PerformBUILD_VECTORCombine.
05237 /// This function expects two 256-bit vectors called V0 and V1.
05238 /// At first, each vector is split into two separate 128-bit vectors.
05239 /// Then, the resulting 128-bit vectors are used to implement two
05240 /// horizontal binary operations.
05241 ///
05242 /// The kind of horizontal binary operation is defined by \p X86Opcode.
05243 ///
05244 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
05245 /// the two new horizontal binops.
05246 /// When Mode is set, the first horizontal binop dag node takes as input
05247 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
05248 /// horizontal binop dag node takes as input the lower 128-bit of V1
05249 /// and the upper 128-bit of V1.
05250 ///   Example:
05251 ///     HADD V0_LO, V0_HI
05252 ///     HADD V1_LO, V1_HI
05253 ///
05254 /// Otherwise, the first horizontal binop dag node takes as input the lower
05255 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
05256 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
05257 ///   Example:
05258 ///     HADD V0_LO, V1_LO
05259 ///     HADD V0_HI, V1_HI
05260 ///
05261 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
05262 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
05263 /// the upper 128-bits of the result.
05264 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
05265                                      SDLoc DL, SelectionDAG &DAG,
05266                                      unsigned X86Opcode, bool Mode,
05267                                      bool isUndefLO, bool isUndefHI) {
05268   EVT VT = V0.getValueType();
05269   assert(VT.is256BitVector() && VT == V1.getValueType() &&
05270          "Invalid nodes in input!");
05271 
05272   unsigned NumElts = VT.getVectorNumElements();
05273   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
05274   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
05275   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
05276   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
05277   EVT NewVT = V0_LO.getValueType();
05278 
05279   SDValue LO = DAG.getUNDEF(NewVT);
05280   SDValue HI = DAG.getUNDEF(NewVT);
05281 
05282   if (Mode) {
05283     // Don't emit a horizontal binop if the result is expected to be UNDEF.
05284     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
05285       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
05286     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
05287       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
05288   } else {
05289     // Don't emit a horizontal binop if the result is expected to be UNDEF.
05290     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
05291                        V1_LO->getOpcode() != ISD::UNDEF))
05292       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
05293 
05294     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
05295                        V1_HI->getOpcode() != ISD::UNDEF))
05296       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
05297   }
05298 
05299   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
05300 }
05301 
05302 /// \brief Try to fold a build_vector that performs an 'addsub' into the
05303 /// sequence of 'vadd + vsub + blendi'.
05304 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
05305                            const X86Subtarget *Subtarget) {
05306   SDLoc DL(BV);
05307   EVT VT = BV->getValueType(0);
05308   unsigned NumElts = VT.getVectorNumElements();
05309   SDValue InVec0 = DAG.getUNDEF(VT);
05310   SDValue InVec1 = DAG.getUNDEF(VT);
05311 
05312   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
05313           VT == MVT::v2f64) && "build_vector with an invalid type found!");
05314 
05315   // Odd-numbered elements in the input build vector are obtained from
05316   // adding two integer/float elements.
05317   // Even-numbered elements in the input build vector are obtained from
05318   // subtracting two integer/float elements.
05319   unsigned ExpectedOpcode = ISD::FSUB;
05320   unsigned NextExpectedOpcode = ISD::FADD;
05321   bool AddFound = false;
05322   bool SubFound = false;
05323 
05324   for (unsigned i = 0, e = NumElts; i != e; ++i) {
05325     SDValue Op = BV->getOperand(i);
05326 
05327     // Skip 'undef' values.
05328     unsigned Opcode = Op.getOpcode();
05329     if (Opcode == ISD::UNDEF) {
05330       std::swap(ExpectedOpcode, NextExpectedOpcode);
05331       continue;
05332     }
05333 
05334     // Early exit if we found an unexpected opcode.
05335     if (Opcode != ExpectedOpcode)
05336       return SDValue();
05337 
05338     SDValue Op0 = Op.getOperand(0);
05339     SDValue Op1 = Op.getOperand(1);
05340 
05341     // Try to match the following pattern:
05342     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
05343     // Early exit if we cannot match that sequence.
05344     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
05345         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
05346         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
05347         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
05348         Op0.getOperand(1) != Op1.getOperand(1))
05349       return SDValue();
05350 
05351     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
05352     if (I0 != i)
05353       return SDValue();
05354 
05355     // We found a valid add/sub node. Update the information accordingly.
05356     if (i & 1)
05357       AddFound = true;
05358     else
05359       SubFound = true;
05360 
05361     // Update InVec0 and InVec1.
05362     if (InVec0.getOpcode() == ISD::UNDEF)
05363       InVec0 = Op0.getOperand(0);
05364     if (InVec1.getOpcode() == ISD::UNDEF)
05365       InVec1 = Op1.getOperand(0);
05366 
05367     // Make sure that the operands of each add/sub node always
05368     // come from the same pair of vectors.
05369     if (InVec0 != Op0.getOperand(0)) {
05370       if (ExpectedOpcode == ISD::FSUB)
05371         return SDValue();
05372 
05373       // FADD is commutable. Try to commute the operands
05374       // and then test again.
05375       std::swap(Op0, Op1);
05376       if (InVec0 != Op0.getOperand(0))
05377         return SDValue();
05378     }
05379 
05380     if (InVec1 != Op1.getOperand(0))
05381       return SDValue();
05382 
05383     // Update the pair of expected opcodes.
05384     std::swap(ExpectedOpcode, NextExpectedOpcode);
05385   }
05386 
05387   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
05388   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
05389       InVec1.getOpcode() != ISD::UNDEF)
05390     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
05391 
05392   return SDValue();
05393 }
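
// For illustration (vector names assumed): the v4f32 build_vector
//   (fsub (extract A, 0), (extract B, 0)),
//   (fadd (extract A, 1), (extract B, 1)),
//   (fsub (extract A, 2), (extract B, 2)),
//   (fadd (extract A, 3), (extract B, 3))
// is matched above and folded into (X86ISD::ADDSUB A, B), i.e. what
// addsubps computes: subtract in the even lanes, add in the odd lanes.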
05394 
05395 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
05396                                           const X86Subtarget *Subtarget) {
05397   SDLoc DL(N);
05398   EVT VT = N->getValueType(0);
05399   unsigned NumElts = VT.getVectorNumElements();
05400   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
05401   SDValue InVec0, InVec1;
05402 
05403   // Try to match an ADDSUB.
05404   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
05405       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
05406     SDValue Value = matchAddSub(BV, DAG, Subtarget);
05407     if (Value.getNode())
05408       return Value;
05409   }
05410 
05411   // Try to match horizontal ADD/SUB.
05412   unsigned NumUndefsLO = 0;
05413   unsigned NumUndefsHI = 0;
05414   unsigned Half = NumElts/2;
05415 
05416   // Count the number of UNDEF operands in the input build_vector.
05417   for (unsigned i = 0, e = Half; i != e; ++i)
05418     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
05419       NumUndefsLO++;
05420 
05421   for (unsigned i = Half, e = NumElts; i != e; ++i)
05422     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
05423       NumUndefsHI++;
05424 
05425   // Early exit if this is a build_vector of all UNDEFs, or if all the
05426   // operands but one are UNDEF.
05427   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
05428     return SDValue();
05429 
05430   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
05431     // Try to match an SSE3 float HADD/HSUB.
05432     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
05433       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
05434 
05435     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
05436       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
05437   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
05438     // Try to match an SSSE3 integer HADD/HSUB.
05439     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
05440       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
05441 
05442     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
05443       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
05444   }
05445 
05446   if (!Subtarget->hasAVX())
05447     return SDValue();
05448 
05449   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
05450     // Try to match an AVX horizontal add/sub of packed single/double
05451     // precision floating point values from 256-bit vectors.
05452     SDValue InVec2, InVec3;
05453     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
05454         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
05455         ((InVec0.getOpcode() == ISD::UNDEF ||
05456           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
05457         ((InVec1.getOpcode() == ISD::UNDEF ||
05458           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
05459       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
05460 
05461     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
05462         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
05463         ((InVec0.getOpcode() == ISD::UNDEF ||
05464           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
05465         ((InVec1.getOpcode() == ISD::UNDEF ||
05466           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
05467       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
05468   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
05469     // Try to match an AVX2 horizontal add/sub of signed integers.
05470     SDValue InVec2, InVec3;
05471     unsigned X86Opcode;
05472     bool CanFold = true;
05473 
05474     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
05475         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
05476         ((InVec0.getOpcode() == ISD::UNDEF ||
05477           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
05478         ((InVec1.getOpcode() == ISD::UNDEF ||
05479           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
05480       X86Opcode = X86ISD::HADD;
05481     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
05482         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
05483         ((InVec0.getOpcode() == ISD::UNDEF ||
05484           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
05485         ((InVec1.getOpcode() == ISD::UNDEF ||
05486           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
05487       X86Opcode = X86ISD::HSUB;
05488     else
05489       CanFold = false;
05490 
05491     if (CanFold) {
05492       // Fold this build_vector into a single horizontal add/sub.
05493       // Do this only if the target has AVX2.
05494       if (Subtarget->hasAVX2())
05495         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
05496 
05497       // Do not try to expand this build_vector into a pair of horizontal
05498       // add/sub if we can emit a pair of scalar add/sub.
05499       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
05500         return SDValue();
05501 
05502       // Convert this build_vector into a pair of horizontal binops followed by
05503       // a concat vector.
05504       bool isUndefLO = NumUndefsLO == Half;
05505       bool isUndefHI = NumUndefsHI == Half;
05506       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
05507                                    isUndefLO, isUndefHI);
05508     }
05509   }
05510 
05511   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
05512        VT == MVT::v16i16) && Subtarget->hasAVX()) {
05513     unsigned X86Opcode;
05514     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
05515       X86Opcode = X86ISD::HADD;
05516     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
05517       X86Opcode = X86ISD::HSUB;
05518     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
05519       X86Opcode = X86ISD::FHADD;
05520     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
05521       X86Opcode = X86ISD::FHSUB;
05522     else
05523       return SDValue();
05524 
05525     // Don't try to expand this build_vector into a pair of horizontal add/sub
05526     // if we can simply emit a pair of scalar add/sub.
05527     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
05528       return SDValue();
05529 
05530     // Convert this build_vector into two horizontal add/sub followed by
05531     // a concat vector.
05532     bool isUndefLO = NumUndefsLO == Half;
05533     bool isUndefHI = NumUndefsHI == Half;
05534     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
05535                                  isUndefLO, isUndefHI);
05536   }
05537 
05538   return SDValue();
05539 }
05540 
05541 SDValue
05542 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
05543   SDLoc dl(Op);
05544 
05545   MVT VT = Op.getSimpleValueType();
05546   MVT ExtVT = VT.getVectorElementType();
05547   unsigned NumElems = Op.getNumOperands();
05548 
05549   // Generate vectors for predicate vectors.
05550   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
05551     return LowerBUILD_VECTORvXi1(Op, DAG);
05552 
05553   // Vectors containing all zeros can be matched by pxor and xorps later
05554   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
05555     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
05556     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
05557     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
05558       return Op;
05559 
05560     return getZeroVector(VT, Subtarget, DAG, dl);
05561   }
05562 
05563   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
05564   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
05565   // vpcmpeqd on 256-bit vectors.
05566   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
05567     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
05568       return Op;
05569 
05570     if (!VT.is512BitVector())
05571       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
05572   }
05573 
05574   if (SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG))
05575     return Broadcast;
05576 
05577   unsigned EVTBits = ExtVT.getSizeInBits();
05578 
05579   unsigned NumZero  = 0;
05580   unsigned NumNonZero = 0;
05581   unsigned NonZeros = 0;
05582   bool IsAllConstants = true;
05583   SmallSet<SDValue, 8> Values;
05584   for (unsigned i = 0; i < NumElems; ++i) {
05585     SDValue Elt = Op.getOperand(i);
05586     if (Elt.getOpcode() == ISD::UNDEF)
05587       continue;
05588     Values.insert(Elt);
05589     if (Elt.getOpcode() != ISD::Constant &&
05590         Elt.getOpcode() != ISD::ConstantFP)
05591       IsAllConstants = false;
05592     if (X86::isZeroNode(Elt))
05593       NumZero++;
05594     else {
05595       NonZeros |= (1 << i);
05596       NumNonZero++;
05597     }
05598   }
05599 
05600   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
05601   if (NumNonZero == 0)
05602     return DAG.getUNDEF(VT);
05603 
05604   // Special case for single non-zero, non-undef, element.
05605   if (NumNonZero == 1) {
05606     unsigned Idx = countTrailingZeros(NonZeros);
05607     SDValue Item = Op.getOperand(Idx);
05608 
05609     // If this is an insertion of an i64 value on x86-32, and if the top bits of
05610     // the value are obviously zero, truncate the value to i32 and do the
05611     // insertion that way.  Only do this if the value is non-constant or if the
05612     // value is a constant being inserted into element 0.  It is cheaper to do
05613     // a constant pool load than it is to do a movd + shuffle.
05614     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
05615         (!IsAllConstants || Idx == 0)) {
05616       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
05617         // Handle SSE only.
05618         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
05619         EVT VecVT = MVT::v4i32;
05620 
05621         // Truncate the value (which may itself be a constant) to i32, and
05622         // convert it to a vector with movd (S2V+shuffle to zero extend).
05623         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
05624         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
05625         return DAG.getNode(
05626             ISD::BITCAST, dl, VT,
05627             getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
05628       }
05629     }
05630 
05631     // If we have a constant or non-constant insertion into the low element of
05632     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
05633     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
05634     // depending on what the source datatype is.
05635     if (Idx == 0) {
05636       if (NumZero == 0)
05637         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
05638 
05639       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
05640           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
05641         if (VT.is512BitVector()) {
05642           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
05643           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
05644                              Item, DAG.getIntPtrConstant(0));
05645         }
05646         assert((VT.is128BitVector() || VT.is256BitVector()) &&
05647                "Expected an SSE value type!");
05648         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
05649         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
05650         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
05651       }
05652 
05653       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
05654         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
05655         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
05656         if (VT.is256BitVector()) {
05657           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
05658           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
05659         } else {
05660           assert(VT.is128BitVector() && "Expected an SSE value type!");
05661           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
05662         }
05663         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
05664       }
05665     }
05666 
05667     // Is it a vector logical left shift?
05668     if (NumElems == 2 && Idx == 1 &&
05669         X86::isZeroNode(Op.getOperand(0)) &&
05670         !X86::isZeroNode(Op.getOperand(1))) {
05671       unsigned NumBits = VT.getSizeInBits();
05672       return getVShift(true, VT,
05673                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
05674                                    VT, Op.getOperand(1)),
05675                        NumBits/2, DAG, *this, dl);
05676     }
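
    // One concrete instance of the special case above (values chosen for
    // illustration): the v2i64 build_vector <0, X> with a non-constant X is
    // lowered as a vector shift of (scalar_to_vector X) left by half the
    // vector width (64 bits), conceptually a MOVQ followed by PSLLDQ $8,
    // leaving zero in element 0 and X in element 1.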
05677 
05678     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
05679       return SDValue();
05680 
05681     // Otherwise, if this is a vector with i32 or f32 elements, and the element
05682     // is a non-constant being inserted into an element other than the low one,
05683     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
05684     // movd/movss) to move this into the low element, then shuffle it into
05685     // place.
05686     if (EVTBits == 32) {
05687       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
05688       return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
05689     }
05690   }
05691 
05692   // Splat is obviously ok. Let legalizer expand it to a shuffle.
05693   if (Values.size() == 1) {
05694     if (EVTBits == 32) {
05695       // Instead of a shuffle like this:
05696       //   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
05697       // check if it's possible to issue this instead:
05698       //   shuffle (vload ptr), undef, <1, 1, 1, 1>
05699       unsigned Idx = countTrailingZeros(NonZeros);
05700       SDValue Item = Op.getOperand(Idx);
05701       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
05702         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
05703     }
05704     return SDValue();
05705   }
05706 
05707   // A vector full of immediates; various special cases are already
05708   // handled, so this is best done with a single constant-pool load.
05709   if (IsAllConstants)
05710     return SDValue();
05711 
05712   // For AVX-length vectors, see if we can use a vector load to get all of the
05713   // elements, otherwise build the individual 128-bit pieces and use
05714   // shuffles to put them in place.
05715   if (VT.is256BitVector() || VT.is512BitVector()) {
05716     SmallVector<SDValue, 64> V(Op->op_begin(), Op->op_begin() + NumElems);
05717 
05718     // Check for a build vector of consecutive loads.
05719     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
05720       return LD;
05721 
05722     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
05723 
05724     // Build both the lower and upper subvector.
05725     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
05726                                 makeArrayRef(&V[0], NumElems/2));
05727     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
05728                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
05729 
05730     // Recreate the wider vector with the lower and upper part.
05731     if (VT.is256BitVector())
05732       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
05733     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
05734   }
05735 
05736   // Let legalizer expand 2-wide build_vectors.
05737   if (EVTBits == 64) {
05738     if (NumNonZero == 1) {
05739       // One half is zero or undef.
05740       unsigned Idx = countTrailingZeros(NonZeros);
05741       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
05742                                  Op.getOperand(Idx));
05743       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
05744     }
05745     return SDValue();
05746   }
05747 
05748   // If element VT is < 32 bits, convert it to inserts into a zero vector.
05749   if (EVTBits == 8 && NumElems == 16)
05750     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
05751                                         Subtarget, *this))
05752       return V;
05753 
05754   if (EVTBits == 16 && NumElems == 8)
05755     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
05756                                       Subtarget, *this))
05757       return V;
05758 
05759   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
05760   if (EVTBits == 32 && NumElems == 4)
05761     if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
05762       return V;
05763 
05764   // If element VT is == 32 bits, turn it into a number of shuffles.
05765   SmallVector<SDValue, 8> V(NumElems);
05766   if (NumElems == 4 && NumZero > 0) {
05767     for (unsigned i = 0; i < 4; ++i) {
05768       bool isZero = !(NonZeros & (1 << i));
05769       if (isZero)
05770         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
05771       else
05772         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
05773     }
05774 
05775     for (unsigned i = 0; i < 2; ++i) {
05776       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
05777         default: break;
05778         case 0:
05779           V[i] = V[i*2];  // Must be a zero vector.
05780           break;
05781         case 1:
05782           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
05783           break;
05784         case 2:
05785           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
05786           break;
05787         case 3:
05788           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
05789           break;
05790       }
05791     }
05792 
05793     bool Reverse1 = (NonZeros & 0x3) == 2;
05794     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
05795     int MaskVec[] = {
05796       Reverse1 ? 1 : 0,
05797       Reverse1 ? 0 : 1,
05798       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
05799       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
05800     };
05801     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
05802   }
05803 
05804   if (Values.size() > 1 && VT.is128BitVector()) {
05805     // Check for a build vector of consecutive loads.
05806     for (unsigned i = 0; i < NumElems; ++i)
05807       V[i] = Op.getOperand(i);
05808 
05809     // Check for elements which are consecutive loads.
05810     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
05811       return LD;
05812 
05813     // Check for a build vector built mostly from a shuffle plus a few inserts.
05814     if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
05815       return Sh;
05816 
05817     // For SSE 4.1, use insertps to insert each of the upper elements into place.
05818     if (Subtarget->hasSSE41()) {
05819       SDValue Result;
05820       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
05821         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
05822       else
05823         Result = DAG.getUNDEF(VT);
05824 
05825       for (unsigned i = 1; i < NumElems; ++i) {
05826         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
05827         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
05828                              Op.getOperand(i), DAG.getIntPtrConstant(i));
05829       }
05830       return Result;
05831     }
05832 
05833     // Otherwise, expand into a number of unpckl*, start by extending each of
05834     // our (non-undef) elements to the full vector width with the element in the
05835     // bottom slot of the vector (which generates no code for SSE).
05836     for (unsigned i = 0; i < NumElems; ++i) {
05837       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
05838         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
05839       else
05840         V[i] = DAG.getUNDEF(VT);
05841     }
05842 
05843     // Next, we iteratively mix elements, e.g. for v4f32:
05844     //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
05845     //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
05846     //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
05847     unsigned EltStride = NumElems >> 1;
05848     while (EltStride != 0) {
05849       for (unsigned i = 0; i < EltStride; ++i) {
05850         // If V[i+EltStride] is undef and this is the first round of mixing,
05851         // then it is safe to just drop this shuffle: V[i] is already in the
05852         // right place, the one element (since it's the first round) being
05853         // inserted as undef can be dropped.  This isn't safe for successive
05854         // rounds because they will permute elements within both vectors.
05855         if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
05856             EltStride == NumElems/2)
05857           continue;
05858 
05859         V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
05860       }
05861       EltStride >>= 1;
05862     }
05863     return V[0];
05864   }
05865   return SDValue();
05866 }
05867 
05868 // LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
05869 // to create 256-bit vectors from two other 128-bit ones.
05870 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
05871   SDLoc dl(Op);
05872   MVT ResVT = Op.getSimpleValueType();
05873 
05874   assert((ResVT.is256BitVector() ||
05875           ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
05876 
05877   SDValue V1 = Op.getOperand(0);
05878   SDValue V2 = Op.getOperand(1);
05879   unsigned NumElems = ResVT.getVectorNumElements();
05880   if (ResVT.is256BitVector())
05881     return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
05882 
05883   if (Op.getNumOperands() == 4) {
05884     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
05885                                 ResVT.getVectorNumElements()/2);
05886     SDValue V3 = Op.getOperand(2);
05887     SDValue V4 = Op.getOperand(3);
05888     return Concat256BitVectors(Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
05889       Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT, NumElems, DAG, dl);
05890   }
05891   return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
05892 }
05893 
05894 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
05895                                        const X86Subtarget *Subtarget,
05896                                        SelectionDAG &DAG) {
05897   SDLoc dl(Op);
05898   MVT ResVT = Op.getSimpleValueType();
05899   unsigned NumOfOperands = Op.getNumOperands();
05900 
05901   assert(isPowerOf2_32(NumOfOperands) &&
05902          "Unexpected number of operands in CONCAT_VECTORS");
05903 
05904   if (NumOfOperands > 2) {
05905     MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
05906                                   ResVT.getVectorNumElements()/2);
05907     SmallVector<SDValue, 2> Ops;
05908     for (unsigned i = 0; i < NumOfOperands/2; i++)
05909       Ops.push_back(Op.getOperand(i));
05910     SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
05911     Ops.clear();
05912     for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
05913       Ops.push_back(Op.getOperand(i));
05914     SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
05915     return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
05916   }
05917 
05918   SDValue V1 = Op.getOperand(0);
05919   SDValue V2 = Op.getOperand(1);
05920   bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
05921   bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
05922 
05923   if (IsZeroV1 && IsZeroV2)
05924     return getZeroVector(ResVT, Subtarget, DAG, dl);
05925 
05926   SDValue ZeroIdx = DAG.getIntPtrConstant(0);
05927   SDValue Undef = DAG.getUNDEF(ResVT);
05928   unsigned NumElems = ResVT.getVectorNumElements();
05929   SDValue ShiftBits = DAG.getConstant(NumElems/2, MVT::i8);
05930 
05931   V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, ZeroIdx);
05932   V2 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V2, ShiftBits);
05933   if (IsZeroV1)
05934     return V2;
05935 
05936   V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
05937   // Zero the upper bits of V1
05938   V1 = DAG.getNode(X86ISD::VSHLI, dl, ResVT, V1, ShiftBits);
05939   V1 = DAG.getNode(X86ISD::VSRLI, dl, ResVT, V1, ShiftBits);
05940   if (IsZeroV2)
05941     return V1;
05942   return DAG.getNode(ISD::OR, dl, ResVT, V1, V2);
05943 }
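
// Sketch of the mask-concatenation trick above (widths chosen for
// illustration): to concatenate two v8i1 values V1 and V2 into a v16i1,
// both are widened into v16i1 mask registers, V2 is shifted left by 8 bit
// positions, the upper 8 bits of V1 are cleared with a shift-left /
// shift-right pair, and the two halves are OR'ed together, roughly
//   result = (V2 << 8) | (V1 & 0xFF)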
05944 
05945 static SDValue LowerCONCAT_VECTORS(SDValue Op,
05946                                    const X86Subtarget *Subtarget,
05947                                    SelectionDAG &DAG) {
05948   MVT VT = Op.getSimpleValueType();
05949   if (VT.getVectorElementType() == MVT::i1)
05950     return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
05951 
05952   assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
05953          (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
05954           Op.getNumOperands() == 4)));
05955 
05956   // AVX can use the vinsertf128 instruction to create 256-bit vectors
05957   // from two other 128-bit ones.
05958 
05959   // A 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors.
05960   return LowerAVXCONCAT_VECTORS(Op, DAG);
05961 }
05962 
05963 
05964 //===----------------------------------------------------------------------===//
05965 // Vector shuffle lowering
05966 //
05967 // This is an experimental code path for lowering vector shuffles on x86. It is
05968 // designed to handle arbitrary vector shuffles and blends, gracefully
05969 // degrading performance as necessary. It works hard to recognize idiomatic
05970 // shuffles and lower them to optimal instruction patterns while staying within
05971 // a framework that allows reasonably efficient handling of all vector shuffle
05972 // patterns.
05973 //===----------------------------------------------------------------------===//
05974 
05975 /// \brief Tiny helper function to identify a no-op mask.
05976 ///
05977 /// This is a somewhat boring predicate function. It checks whether the mask
05978 /// array input, which is assumed to be a single-input shuffle mask of the kind
05979 /// used by the X86 shuffle instructions (not a fully general
05980 /// ShuffleVectorSDNode mask), requires any shuffles to occur. Both undef and an
05981 /// in-place shuffle are no-ops.
05982 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
05983   for (int i = 0, Size = Mask.size(); i < Size; ++i)
05984     if (Mask[i] != -1 && Mask[i] != i)
05985       return false;
05986   return true;
05987 }
05988 
05989 /// \brief Helper function to classify a mask as a single-input mask.
05990 ///
05991 /// This isn't a generic single-input test because in the vector shuffle
05992 /// lowering we canonicalize single inputs to be the first input operand. This
05993 /// means we can more quickly test for a single input by only checking whether
05994 /// an input from the second operand exists. We also assume that the size of the
05995 /// mask corresponds to the size of the input vectors, which isn't true in the
05996 /// fully general case.
05997 static bool isSingleInputShuffleMask(ArrayRef<int> Mask) {
05998   for (int M : Mask)
05999     if (M >= (int)Mask.size())
06000       return false;
06001   return true;
06002 }
06003 
06004 /// \brief Test whether there are elements crossing 128-bit lanes in this
06005 /// shuffle mask.
06006 ///
06007 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
06008 /// and we routinely test for these.
06009 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
06010   int LaneSize = 128 / VT.getScalarSizeInBits();
06011   int Size = Mask.size();
06012   for (int i = 0; i < Size; ++i)
06013     if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
06014       return true;
06015   return false;
06016 }
06017 
06018 /// \brief Test whether a shuffle mask is equivalent within each 128-bit lane.
06019 ///
06020 /// This checks a shuffle mask to see if it is performing the same
06021 /// 128-bit lane-relative shuffle in each 128-bit lane. This trivially implies
06022 /// that it is also not lane-crossing. It may however involve a blend from the
06023 /// same lane of a second vector.
06024 ///
06025 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
06026 /// non-trivial to compute in the face of undef lanes. The representation is
06027 /// *not* suitable for use with existing 128-bit shuffles as it will contain
06028 /// entries from both V1 and V2 inputs to the wider mask.
06029 static bool
06030 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
06031                                 SmallVectorImpl<int> &RepeatedMask) {
06032   int LaneSize = 128 / VT.getScalarSizeInBits();
06033   RepeatedMask.resize(LaneSize, -1);
06034   int Size = Mask.size();
06035   for (int i = 0; i < Size; ++i) {
06036     if (Mask[i] < 0)
06037       continue;
06038     if ((Mask[i] % Size) / LaneSize != i / LaneSize)
06039       // This entry crosses lanes, so there is no way to model this shuffle.
06040       return false;
06041 
06042     // Ok, handle the in-lane shuffles by detecting if and when they repeat.
06043     if (RepeatedMask[i % LaneSize] == -1)
06044       // This is the first non-undef entry in this slot of a 128-bit lane.
06045       RepeatedMask[i % LaneSize] =
06046           Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + Size;
06047     else if (RepeatedMask[i % LaneSize] + (i / LaneSize) * LaneSize != Mask[i])
06048       // Found a mismatch with the repeated mask.
06049       return false;
06050   }
06051   return true;
06052 }
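
// For example, the v8i32 mask {0, 1, 8, 9, 4, 5, 12, 13} performs the same
// blend of V1 and V2 in both 128-bit lanes, so the routine above returns true
// with a repeated mask of {0, 1, 8, 9} (the last two entries referring into
// V2), whereas {0, 1, 8, 9, 4, 5, 12, 15} fails the repeat check on its final
// element.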
06053 
06054 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
06055 /// arguments.
06056 ///
06057 /// This is a fast way to test a shuffle mask against a fixed pattern:
06058 ///
06059 ///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
06060 ///
06061 /// It returns true if the mask is exactly as wide as the argument list, and
06062 /// each element of the mask is either -1 (signifying undef) or the value given
06063 /// in the argument.
06064 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
06065                                 ArrayRef<int> ExpectedMask) {
06066   if (Mask.size() != ExpectedMask.size())
06067     return false;
06068 
06069   int Size = Mask.size();
06070 
06071   // If the values are build vectors, we can look through them to find
06072   // equivalent inputs that make the shuffles equivalent.
06073   auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
06074   auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
06075 
06076   for (int i = 0; i < Size; ++i)
06077     if (Mask[i] != -1 && Mask[i] != ExpectedMask[i]) {
06078       auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
06079       auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
06080       if (!MaskBV || !ExpectedBV ||
06081           MaskBV->getOperand(Mask[i] % Size) !=
06082               ExpectedBV->getOperand(ExpectedMask[i] % Size))
06083         return false;
06084     }
06085 
06086   return true;
06087 }
06088 
06089 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
06090 ///
06091 /// This helper function produces an 8-bit shuffle immediate corresponding to
06092 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
06093 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
06094 /// example.
06095 ///
06096 /// NB: We rely heavily on "undef" masks preserving the input lane.
06097 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask,
06098                                           SelectionDAG &DAG) {
06099   assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
06100   assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
06101   assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
06102   assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
06103   assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
06104 
06105   unsigned Imm = 0;
06106   Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0;
06107   Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2;
06108   Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4;
06109   Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6;
06110   return DAG.getConstant(Imm, MVT::i8);
06111 }
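
// For example, the mask {2, 3, 0, 1} packs as
//   Imm = 2 | (3 << 2) | (0 << 4) | (1 << 6) = 0x4E,
// the familiar PSHUFD immediate for swapping the two 64-bit halves of a
// v4i32, while a fully undef mask {-1, -1, -1, -1} produces the identity
// encoding 0xE4 because each undef slot keeps its own lane index.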
06112 
06113 /// \brief Try to emit a blend instruction for a shuffle using bit math.
06114 ///
06115 /// This is used as a fallback approach when first class blend instructions are
06116 /// unavailable. Currently it is only suitable for integer vectors, but could
06117 /// be generalized for floating point vectors if desirable.
06118 static SDValue lowerVectorShuffleAsBitBlend(SDLoc DL, MVT VT, SDValue V1,
06119                                             SDValue V2, ArrayRef<int> Mask,
06120                                             SelectionDAG &DAG) {
06121   assert(VT.isInteger() && "Only supports integer vector types!");
06122   MVT EltVT = VT.getScalarType();
06123   int NumEltBits = EltVT.getSizeInBits();
06124   SDValue Zero = DAG.getConstant(0, EltVT);
06125   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), EltVT);
06126   SmallVector<SDValue, 16> MaskOps;
06127   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06128     if (Mask[i] != -1 && Mask[i] != i && Mask[i] != i + Size)
06129       return SDValue(); // Shuffled input!
06130     MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
06131   }
06132 
06133   SDValue V1Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, MaskOps);
06134   V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
06135   // We have to cast V2 around.
06136   MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
06137   V2 = DAG.getNode(ISD::BITCAST, DL, VT,
06138                    DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
06139                                DAG.getNode(ISD::BITCAST, DL, MaskVT, V1Mask),
06140                                DAG.getNode(ISD::BITCAST, DL, MaskVT, V2)));
06141   return DAG.getNode(ISD::OR, DL, VT, V1, V2);
06142 }
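
// For example, the v4i32 mask {0, 5, 2, 7} keeps elements 0 and 2 of V1 and
// elements 1 and 3 of V2 in place, so V1Mask becomes {-1, 0, -1, 0} and the
// result is (V1 & V1Mask) | (V2 & ~V1Mask), with the ANDNP done on v2i64
// after bitcasting. A mask such as {1, 5, 2, 7} is rejected because element 0
// is moved rather than kept in place.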
06143 
06144 /// \brief Try to emit a blend instruction for a shuffle.
06145 ///
06146 /// This doesn't do any checks for the availability of instructions for blending
06147 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
06148 /// be matched in the backend with the type given. What it does check for is
06149 /// that the shuffle mask is in fact a blend.
06150 static SDValue lowerVectorShuffleAsBlend(SDLoc DL, MVT VT, SDValue V1,
06151                                          SDValue V2, ArrayRef<int> Mask,
06152                                          const X86Subtarget *Subtarget,
06153                                          SelectionDAG &DAG) {
06154   unsigned BlendMask = 0;
06155   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06156     if (Mask[i] >= Size) {
06157       if (Mask[i] != i + Size)
06158         return SDValue(); // Shuffled V2 input!
06159       BlendMask |= 1u << i;
06160       continue;
06161     }
06162     if (Mask[i] >= 0 && Mask[i] != i)
06163       return SDValue(); // Shuffled V1 input!
06164   }
06165   switch (VT.SimpleTy) {
06166   case MVT::v2f64:
06167   case MVT::v4f32:
06168   case MVT::v4f64:
06169   case MVT::v8f32:
06170     return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
06171                        DAG.getConstant(BlendMask, MVT::i8));
06172 
06173   case MVT::v4i64:
06174   case MVT::v8i32:
06175     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
06176     // FALLTHROUGH
06177   case MVT::v2i64:
06178   case MVT::v4i32:
06179     // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
06180     // that instruction.
06181     if (Subtarget->hasAVX2()) {
06182       // Scale the blend by the number of 32-bit dwords per element.
06183       int Scale = VT.getScalarSizeInBits() / 32;
06184       BlendMask = 0;
06185       for (int i = 0, Size = Mask.size(); i < Size; ++i)
06186         if (Mask[i] >= Size)
06187           for (int j = 0; j < Scale; ++j)
06188             BlendMask |= 1u << (i * Scale + j);
06189 
06190       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
06191       V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
06192       V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
06193       return DAG.getNode(ISD::BITCAST, DL, VT,
06194                          DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
06195                                      DAG.getConstant(BlendMask, MVT::i8)));
06196     }
06197     // FALLTHROUGH
06198   case MVT::v8i16: {
06199     // For integer shuffles we need to expand the mask and cast the inputs to
06200     // v8i16s prior to blending.
06201     int Scale = 8 / VT.getVectorNumElements();
06202     BlendMask = 0;
06203     for (int i = 0, Size = Mask.size(); i < Size; ++i)
06204       if (Mask[i] >= Size)
06205         for (int j = 0; j < Scale; ++j)
06206           BlendMask |= 1u << (i * Scale + j);
06207 
06208     V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1);
06209     V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V2);
06210     return DAG.getNode(ISD::BITCAST, DL, VT,
06211                        DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
06212                                    DAG.getConstant(BlendMask, MVT::i8)));
06213   }
06214 
06215   case MVT::v16i16: {
06216     assert(Subtarget->hasAVX2() && "256-bit integer blends require AVX2!");
06217     SmallVector<int, 8> RepeatedMask;
06218     if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
06219       // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
06220       assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
06221       BlendMask = 0;
06222       for (int i = 0; i < 8; ++i)
06223         if (RepeatedMask[i] >= 16)
06224           BlendMask |= 1u << i;
06225       return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
06226                          DAG.getConstant(BlendMask, MVT::i8));
06227     }
06228   }
06229     // FALLTHROUGH
06230   case MVT::v16i8:
06231   case MVT::v32i8: {
06232     assert((VT.getSizeInBits() == 128 || Subtarget->hasAVX2()) &&
06233            "256-bit byte-blends require AVX2 support!");
06234 
06235     // Scale the blend by the number of bytes per element.
06236     int Scale = VT.getScalarSizeInBits() / 8;
06237 
06238     // This form of blend is always done on bytes. Compute the byte vector
06239     // type.
06240     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
06241 
06242     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
06243     // mix of LLVM's code generator and the x86 backend. We tell the code
06244     // generator that boolean values in the elements of an x86 vector register
06245     // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
06246     // mapping a select to operand #1, and 'false' mapping to operand #2. The
06247     // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
06248     // of the element (the remaining are ignored) and 0 in that high bit would
06249     // mean operand #1 while 1 in the high bit would mean operand #2. So while
06250     // the LLVM model for boolean values in vector elements gets the relevant
06251     // bit set, it is set backwards and over constrained relative to x86's
06252     // actual model.
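    // For example, a byte taken from V1 (Mask[i] < Size) gets an all-ones
    // mask byte, selecting VSELECT operand #1 (V1), while a byte taken from
    // V2 gets a zero mask byte, selecting operand #2 (V2).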
06253     SmallVector<SDValue, 32> VSELECTMask;
06254     for (int i = 0, Size = Mask.size(); i < Size; ++i)
06255       for (int j = 0; j < Scale; ++j)
06256         VSELECTMask.push_back(
06257             Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
06258                         : DAG.getConstant(Mask[i] < Size ? -1 : 0, MVT::i8));
06259 
06260     V1 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V1);
06261     V2 = DAG.getNode(ISD::BITCAST, DL, BlendVT, V2);
06262     return DAG.getNode(
06263         ISD::BITCAST, DL, VT,
06264         DAG.getNode(ISD::VSELECT, DL, BlendVT,
06265                     DAG.getNode(ISD::BUILD_VECTOR, DL, BlendVT, VSELECTMask),
06266                     V1, V2));
06267   }
06268 
06269   default:
06270     llvm_unreachable("Not a supported integer vector type!");
06271   }
06272 }
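
// For example, the v8i16 mask {0, 9, 2, 11, 4, 13, 6, 15} keeps the even
// elements of V1 and the odd elements of V2 in place, giving a PBLENDW
// immediate of 0xAA. Any element that is neither undef, i, nor i + Size
// causes an early bail-out above, because that would need a shuffle rather
// than a pure blend.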
06273 
06274 /// \brief Try to lower as a blend of elements from two inputs followed by
06275 /// a single-input permutation.
06276 ///
06277 /// This matches the pattern where we can blend elements from two inputs and
06278 /// then reduce the shuffle to a single-input permutation.
06279 static SDValue lowerVectorShuffleAsBlendAndPermute(SDLoc DL, MVT VT, SDValue V1,
06280                                                    SDValue V2,
06281                                                    ArrayRef<int> Mask,
06282                                                    SelectionDAG &DAG) {
06283   // We build up the blend mask while checking whether a blend is a viable way
06284   // to reduce the shuffle.
06285   SmallVector<int, 32> BlendMask(Mask.size(), -1);
06286   SmallVector<int, 32> PermuteMask(Mask.size(), -1);
06287 
06288   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06289     if (Mask[i] < 0)
06290       continue;
06291 
06292     assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
06293 
06294     if (BlendMask[Mask[i] % Size] == -1)
06295       BlendMask[Mask[i] % Size] = Mask[i];
06296     else if (BlendMask[Mask[i] % Size] != Mask[i])
06297       return SDValue(); // Can't blend in the needed input!
06298 
06299     PermuteMask[i] = Mask[i] % Size;
06300   }
06301 
06302   SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
06303   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
06304 }
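
// For example, the v4i32 mask {1, 4, 3, 6} becomes the blend {4, 1, 6, 3}
// followed by the single-input permute {1, 0, 3, 2}. A mask such as
// {1, 5, 3, 6} is rejected because elements 1 and 5 would both need slot 1 of
// the blended vector, so no per-element blend can feed the permute.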
06305 
06306 /// \brief Generic routine to decompose a shuffle and blend into independent
06307 /// blends and permutes.
06308 ///
06309 /// This matches the extremely common pattern for handling combined
06310 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
06311 /// operations. It will try to pick the best arrangement of shuffles and
06312 /// blends.
06313 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(SDLoc DL, MVT VT,
06314                                                           SDValue V1,
06315                                                           SDValue V2,
06316                                                           ArrayRef<int> Mask,
06317                                                           SelectionDAG &DAG) {
06318   // Shuffle the input elements into the desired positions in V1 and V2 and
06319   // blend them together.
06320   SmallVector<int, 32> V1Mask(Mask.size(), -1);
06321   SmallVector<int, 32> V2Mask(Mask.size(), -1);
06322   SmallVector<int, 32> BlendMask(Mask.size(), -1);
06323   for (int i = 0, Size = Mask.size(); i < Size; ++i)
06324     if (Mask[i] >= 0 && Mask[i] < Size) {
06325       V1Mask[i] = Mask[i];
06326       BlendMask[i] = i;
06327     } else if (Mask[i] >= Size) {
06328       V2Mask[i] = Mask[i] - Size;
06329       BlendMask[i] = i + Size;
06330     }
06331 
06332   // Try to lower with the simpler initial blend strategy unless one of the
06333   // input shuffles would be a no-op. We prefer to shuffle inputs as the
06334   // shuffle may be able to fold with a load or other benefit. However, when
06335   // we'll have to do 2x as many shuffles in order to achieve this, blending
06336   // first is a better strategy.
06337   if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
06338     if (SDValue BlendPerm =
06339             lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
06340       return BlendPerm;
06341 
06342   V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
06343   V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
06344   return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
06345 }
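
// For example, the v4i32 mask {2, 7, 1, 4} splits into V1Mask {2, -1, 1, -1},
// V2Mask {-1, 3, -1, 0} and BlendMask {0, 5, 2, 7}. Neither per-input shuffle
// is a no-op there, so the blend-then-permute strategy is attempted first;
// only if that fails are the two inputs shuffled separately and then blended.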
06346 
06347 /// \brief Try to lower a vector shuffle as a byte rotation.
06348 ///
06349 /// SSSE3 provides a generic PALIGNR instruction that will do an arbitrary
06350 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
06351 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
06352 /// try to generically lower a vector shuffle through such a pattern. It
06353 /// does not check for the profitability of lowering either as PALIGNR or
06354 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
06355 /// This matches shuffle vectors that look like:
06356 ///
06357 ///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
06358 ///
06359 /// Essentially it concatenates V1 and V2, shifts right by some number of
06360 /// elements, and takes the low elements as the result. Note that while this is
06361 /// specified as a *right shift* because x86 is little-endian, it is a *left
06362 /// rotate* of the vector lanes.
06363 static SDValue lowerVectorShuffleAsByteRotate(SDLoc DL, MVT VT, SDValue V1,
06364                                               SDValue V2,
06365                                               ArrayRef<int> Mask,
06366                                               const X86Subtarget *Subtarget,
06367                                               SelectionDAG &DAG) {
06368   assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
06369 
06370   int NumElts = Mask.size();
06371   int NumLanes = VT.getSizeInBits() / 128;
06372   int NumLaneElts = NumElts / NumLanes;
06373 
06374   // We need to detect various ways of spelling a rotation:
06375   //   [11, 12, 13, 14, 15,  0,  1,  2]
06376   //   [-1, 12, 13, 14, -1, -1,  1, -1]
06377   //   [-1, -1, -1, -1, -1, -1,  1,  2]
06378   //   [ 3,  4,  5,  6,  7,  8,  9, 10]
06379   //   [-1,  4,  5,  6, -1, -1,  9, -1]
06380   //   [-1,  4,  5,  6, -1, -1, -1, -1]
06381   int Rotation = 0;
06382   SDValue Lo, Hi;
06383   for (int l = 0; l < NumElts; l += NumLaneElts) {
06384     for (int i = 0; i < NumLaneElts; ++i) {
06385       if (Mask[l + i] == -1)
06386         continue;
06387       assert(Mask[l + i] >= 0 && "Only -1 is a valid negative mask element!");
06388 
06389       // Get the mod-Size index and lane correct it.
06390       int LaneIdx = (Mask[l + i] % NumElts) - l;
06391       // Make sure it was in this lane.
06392       if (LaneIdx < 0 || LaneIdx >= NumLaneElts)
06393         return SDValue();
06394 
06395       // Determine where a rotated vector would have started.
06396       int StartIdx = i - LaneIdx;
06397       if (StartIdx == 0)
06398         // The identity rotation isn't interesting, stop.
06399         return SDValue();
06400 
06401       // If we found the tail of a vector the rotation must be the missing
06402       // front. If we found the head of a vector, it must be how much of the
06403       // head.
06404       int CandidateRotation = StartIdx < 0 ? -StartIdx : NumLaneElts - StartIdx;
06405 
06406       if (Rotation == 0)
06407         Rotation = CandidateRotation;
06408       else if (Rotation != CandidateRotation)
06409         // The rotations don't match, so we can't match this mask.
06410         return SDValue();
06411 
06412       // Compute which value this mask is pointing at.
06413       SDValue MaskV = Mask[l + i] < NumElts ? V1 : V2;
06414 
06415       // Compute which of the two target values this index should be assigned
06416       // to. This reflects whether the high elements are remaining or the low
06417       // elements are remaining.
06418       SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
06419 
06420       // Either set up this value if we've not encountered it before, or check
06421       // that it remains consistent.
06422       if (!TargetV)
06423         TargetV = MaskV;
06424       else if (TargetV != MaskV)
06425         // This may be a rotation, but it pulls from the inputs in some
06426         // unsupported interleaving.
06427         return SDValue();
06428     }
06429   }
06430 
06431   // Check that we successfully analyzed the mask, and normalize the results.
06432   assert(Rotation != 0 && "Failed to locate a viable rotation!");
06433   assert((Lo || Hi) && "Failed to find a rotated input vector!");
06434   if (!Lo)
06435     Lo = Hi;
06436   else if (!Hi)
06437     Hi = Lo;
06438 
06439   // The actual rotate instruction rotates bytes, so we need to scale the
06440   // rotation based on how many bytes are in the vector lane.
06441   int Scale = 16 / NumLaneElts;
06442 
06443   // SSSE3 targets can use the palignr instruction.
06444   if (Subtarget->hasSSSE3()) {
06445     // Cast the inputs to i8 vector of correct length to match PALIGNR.
06446     MVT AlignVT = MVT::getVectorVT(MVT::i8, 16 * NumLanes);
06447     Lo = DAG.getNode(ISD::BITCAST, DL, AlignVT, Lo);
06448     Hi = DAG.getNode(ISD::BITCAST, DL, AlignVT, Hi);
06449 
06450     return DAG.getNode(ISD::BITCAST, DL, VT,
06451                        DAG.getNode(X86ISD::PALIGNR, DL, AlignVT, Hi, Lo,
06452                                    DAG.getConstant(Rotation * Scale, MVT::i8)));
06453   }
06454 
06455   assert(VT.getSizeInBits() == 128 &&
06456          "Rotate-based lowering only supports 128-bit lowering!");
06457   assert(Mask.size() <= 16 &&
06458          "Can shuffle at most 16 bytes in a 128-bit vector!");
06459 
06460   // Default SSE2 implementation
06461   int LoByteShift = 16 - Rotation * Scale;
06462   int HiByteShift = Rotation * Scale;
06463 
06464   // Cast the inputs to v2i64 to match PSLLDQ/PSRLDQ.
06465   Lo = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Lo);
06466   Hi = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Hi);
06467 
06468   SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v2i64, Lo,
06469                                 DAG.getConstant(LoByteShift, MVT::i8));
06470   SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v2i64, Hi,
06471                                 DAG.getConstant(HiByteShift, MVT::i8));
06472   return DAG.getNode(ISD::BITCAST, DL, VT,
06473                      DAG.getNode(ISD::OR, DL, MVT::v2i64, LoShift, HiShift));
06474 }
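
// For example, for the v8i16 mask [11, 12, 13, 14, 15, 0, 1, 2] every element
// taken from V2 yields StartIdx == -3 and every element taken from V1 yields
// StartIdx == 5, both of which agree on a rotation of 3 elements; with 16-bit
// elements that is scaled by 2 to a 6-byte rotation amount.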
06475 
06476 /// \brief Compute whether each element of a shuffle is zeroable.
06477 ///
06478 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
06479 /// Either it is an undef element in the shuffle mask, the element of the input
06480 /// referenced is undef, or the element of the input referenced is known to be
06481 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
06482 /// as many lanes with this technique as possible to simplify the remaining
06483 /// shuffle.
06484 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
06485                                                      SDValue V1, SDValue V2) {
06486   SmallBitVector Zeroable(Mask.size(), false);
06487 
06488   while (V1.getOpcode() == ISD::BITCAST)
06489     V1 = V1->getOperand(0);
06490   while (V2.getOpcode() == ISD::BITCAST)
06491     V2 = V2->getOperand(0);
06492 
06493   bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
06494   bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
06495 
06496   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06497     int M = Mask[i];
06498     // Handle the easy cases.
06499     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
06500       Zeroable[i] = true;
06501       continue;
06502     }
06503 
06504     // If this is an index into a build_vector node (which has the same number
06505     // of elements), dig out the input value and use it.
06506     SDValue V = M < Size ? V1 : V2;
06507     if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
06508       continue;
06509 
06510     SDValue Input = V.getOperand(M % Size);
06511     // The UNDEF opcode check really should be dead code here, but not quite
06512     // worth asserting on (it isn't invalid, just unexpected).
06513     if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
06514       Zeroable[i] = true;
06515   }
06516 
06517   return Zeroable;
06518 }
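
// For example, if V2 is an all-zeros build_vector, every mask index referring
// into V2 is zeroable (as is every undef index); and if V1 is a build_vector
// whose third operand is a zero constant or undef, mask index 2 is zeroable
// even though V1 as a whole is not known to be zero.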
06519 
06520 /// \brief Try to emit a bitmask instruction for a shuffle.
06521 ///
06522 /// This handles cases where we can model a blend exactly as a bitmask due to
06523 /// one of the inputs being zeroable.
06524 static SDValue lowerVectorShuffleAsBitMask(SDLoc DL, MVT VT, SDValue V1,
06525                                            SDValue V2, ArrayRef<int> Mask,
06526                                            SelectionDAG &DAG) {
06527   MVT EltVT = VT.getScalarType();
06528   int NumEltBits = EltVT.getSizeInBits();
06529   MVT IntEltVT = MVT::getIntegerVT(NumEltBits);
06530   SDValue Zero = DAG.getConstant(0, IntEltVT);
06531   SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), IntEltVT);
06532   if (EltVT.isFloatingPoint()) {
06533     Zero = DAG.getNode(ISD::BITCAST, DL, EltVT, Zero);
06534     AllOnes = DAG.getNode(ISD::BITCAST, DL, EltVT, AllOnes);
06535   }
06536   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
06537   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
06538   SDValue V;
06539   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
06540     if (Zeroable[i])
06541       continue;
06542     if (Mask[i] % Size != i)
06543       return SDValue(); // Not a blend.
06544     if (!V)
06545       V = Mask[i] < Size ? V1 : V2;
06546     else if (V != (Mask[i] < Size ? V1 : V2))
06547       return SDValue(); // Can only let one input through the mask.
06548 
06549     VMaskOps[i] = AllOnes;
06550   }
06551   if (!V)
06552     return SDValue(); // No non-zeroable elements!
06553 
06554   SDValue VMask = DAG.getNode(ISD::BUILD_VECTOR, DL, VT, VMaskOps);
06555   V = DAG.getNode(VT.isFloatingPoint()
06556                   ? (unsigned) X86ISD::FAND : (unsigned) ISD::AND,
06557                   DL, VT, V, VMask);
06558   return V;
06559 }
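
// For example, with V2 zero and the v4i32 mask {0, 4, 2, 4}, elements 1 and 3
// are zeroable, so the shuffle becomes V1 & {-1, 0, -1, 0}. The lowering gives
// up if any non-zeroable element is out of its original position or if the
// surviving elements come from both inputs.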
06560 
06561 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
06562 ///
06563 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
06564 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
06565 /// matches elements from one of the input vectors shuffled to the left or
06566 /// right with zeroable elements 'shifted in'. It handles both the strictly
06567 /// bit-wise element shifts and the byte shift across an entire 128-bit double
06568 /// quad word lane.
06569 ///
06570 /// PSLL : (little-endian) left bit shift.
06571 /// [ zz, 0, zz,  2 ]
06572 /// [ -1, 4, zz, -1 ]
06573 /// PSRL : (little-endian) right bit shift.
06574 /// [  1, zz,  3, zz]
06575 /// [ -1, -1,  7, zz]
06576 /// PSLLDQ : (little-endian) left byte shift
06577 /// [ zz,  0,  1,  2,  3,  4,  5,  6]
06578 /// [ zz, zz, -1, -1,  2,  3,  4, -1]
06579 /// [ zz, zz, zz, zz, zz, zz, -1,  1]
06580 /// PSRLDQ : (little-endian) right byte shift
06581 /// [  5, 6,  7, zz, zz, zz, zz, zz]
06582 /// [ -1, 5,  6,  7, zz, zz, zz, zz]
06583 /// [  1, 2, -1, -1, -1, -1, zz, zz]
06584 static SDValue lowerVectorShuffleAsShift(SDLoc DL, MVT VT, SDValue V1,
06585                                          SDValue V2, ArrayRef<int> Mask,
06586                                          SelectionDAG &DAG) {
06587   SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
06588 
06589   int Size = Mask.size();
06590   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
06591 
06592   auto CheckZeros = [&](int Shift, int Scale, bool Left) {
06593     for (int i = 0; i < Size; i += Scale)
06594       for (int j = 0; j < Shift; ++j)
06595         if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
06596           return false;
06597 
06598     return true;
06599   };
06600 
06601   auto MatchShift = [&](int Shift, int Scale, bool Left, SDValue V) {
06602     for (int i = 0; i != Size; i += Scale) {
06603       unsigned Pos = Left ? i + Shift : i;
06604       unsigned Low = Left ? i : i + Shift;
06605       unsigned Len = Scale - Shift;
06606       if (!isSequentialOrUndefInRange(Mask, Pos, Len,
06607                                       Low + (V == V1 ? 0 : Size)))
06608         return SDValue();
06609     }
06610 
06611     int ShiftEltBits = VT.getScalarSizeInBits() * Scale;
06612     bool ByteShift = ShiftEltBits > 64;
06613     unsigned OpCode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
06614                            : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
06615     int ShiftAmt = Shift * VT.getScalarSizeInBits() / (ByteShift ? 8 : 1);
06616 
06617     // Normalize the scale for byte shifts to still produce an i64 element
06618     // type.
06619     Scale = ByteShift ? Scale / 2 : Scale;
06620 
06621     // We need to round trip through the appropriate type for the shift.
06622     MVT ShiftSVT = MVT::getIntegerVT(VT.getScalarSizeInBits() * Scale);
06623     MVT ShiftVT = MVT::getVectorVT(ShiftSVT, Size / Scale);
06624     assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
06625            "Illegal integer vector type");
06626     V = DAG.getNode(ISD::BITCAST, DL, ShiftVT, V);
06627 
06628     V = DAG.getNode(OpCode, DL, ShiftVT, V, DAG.getConstant(ShiftAmt, MVT::i8));
06629     return DAG.getNode(ISD::BITCAST, DL, VT, V);
06630   };
06631 
06632   // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
06633   // keep doubling the size of the integer elements up to that. We can
06634   // then shift the elements of the integer vector by whole multiples of
06635   // their width within the elements of the larger integer vector. Test each
06636   // multiple to see if we can find a match with the moved element indices
06637   // and that the shifted in elements are all zeroable.
06638   for (int Scale = 2; Scale * VT.getScalarSizeInBits() <= 128; Scale *= 2)
06639     for (int Shift = 1; Shift != Scale; ++Shift)
06640       for (bool Left : {true, false})
06641         if (CheckZeros(Shift, Scale, Left))
06642           for (SDValue V : {V1, V2})
06643             if (SDValue Match = MatchShift(Shift, Scale, Left, V))
06644               return Match;
06645 
06646   // no match
06647   return SDValue();
06648 }
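
// For example, with V2 zero and the v4i32 mask {4, 0, 4, 2}, elements 0 and 2
// are zeroable and the remaining elements each move up by one 32-bit slot, so
// this matches at Scale == 2, Shift == 1: the vector is bitcast to v2i64,
// shifted left by 32 bits (PSLLQ), and bitcast back.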
06649 
06650 /// \brief Lower a vector shuffle as a zero or any extension.
06651 ///
06652 /// Given a specific number of elements, element bit width, and extension
06653 /// stride, produce either a zero or any extension based on the available
06654 /// features of the subtarget.
06655 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
06656     SDLoc DL, MVT VT, int Scale, bool AnyExt, SDValue InputV,
06657     const X86Subtarget *Subtarget, SelectionDAG &DAG) {
06658   assert(Scale > 1 && "Need a scale to extend.");
06659   int NumElements = VT.getVectorNumElements();
06660   int EltBits = VT.getScalarSizeInBits();
06661   assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
06662          "Only 8, 16, and 32 bit elements can be extended.");
06663   assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
06664 
06665   // Found a valid zext mask! Try various lowering strategies based on the
06666   // input type and available ISA extensions.
06667   if (Subtarget->hasSSE41()) {
06668     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
06669                                  NumElements / Scale);
06670     return DAG.getNode(ISD::BITCAST, DL, VT,
06671                        DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV));
06672   }
06673 
06674   // For any extends we can cheat for larger element sizes and use shuffle
06675   // instructions that can fold with a load and/or copy.
06676   if (AnyExt && EltBits == 32) {
06677     int PSHUFDMask[4] = {0, -1, 1, -1};
06678     return DAG.getNode(
06679         ISD::BITCAST, DL, VT,
06680         DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
06681                     DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
06682                     getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG)));
06683   }
06684   if (AnyExt && EltBits == 16 && Scale > 2) {
06685     int PSHUFDMask[4] = {0, -1, 0, -1};
06686     InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
06687                          DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, InputV),
06688                          getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG));
06689     int PSHUFHWMask[4] = {1, -1, -1, -1};
06690     return DA