LLVM API Documentation

X86ISelLowering.cpp
00001 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file defines the interfaces that X86 uses to lower LLVM code into a
00011 // selection DAG.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "X86ISelLowering.h"
00016 #include "Utils/X86ShuffleDecode.h"
00017 #include "X86CallingConv.h"
00018 #include "X86FrameLowering.h"
00019 #include "X86InstrBuilder.h"
00020 #include "X86MachineFunctionInfo.h"
00021 #include "X86TargetMachine.h"
00022 #include "X86TargetObjectFile.h"
00023 #include "llvm/ADT/SmallBitVector.h"
00024 #include "llvm/ADT/SmallSet.h"
00025 #include "llvm/ADT/Statistic.h"
00026 #include "llvm/ADT/StringExtras.h"
00027 #include "llvm/ADT/StringSwitch.h"
00028 #include "llvm/ADT/VariadicFunction.h"
00029 #include "llvm/CodeGen/IntrinsicLowering.h"
00030 #include "llvm/CodeGen/MachineFrameInfo.h"
00031 #include "llvm/CodeGen/MachineFunction.h"
00032 #include "llvm/CodeGen/MachineInstrBuilder.h"
00033 #include "llvm/CodeGen/MachineJumpTableInfo.h"
00034 #include "llvm/CodeGen/MachineModuleInfo.h"
00035 #include "llvm/CodeGen/MachineRegisterInfo.h"
00036 #include "llvm/IR/CallSite.h"
00037 #include "llvm/IR/CallingConv.h"
00038 #include "llvm/IR/Constants.h"
00039 #include "llvm/IR/DerivedTypes.h"
00040 #include "llvm/IR/Function.h"
00041 #include "llvm/IR/GlobalAlias.h"
00042 #include "llvm/IR/GlobalVariable.h"
00043 #include "llvm/IR/Instructions.h"
00044 #include "llvm/IR/Intrinsics.h"
00045 #include "llvm/MC/MCAsmInfo.h"
00046 #include "llvm/MC/MCContext.h"
00047 #include "llvm/MC/MCExpr.h"
00048 #include "llvm/MC/MCSymbol.h"
00049 #include "llvm/Support/CommandLine.h"
00050 #include "llvm/Support/Debug.h"
00051 #include "llvm/Support/ErrorHandling.h"
00052 #include "llvm/Support/MathExtras.h"
00053 #include "llvm/Target/TargetOptions.h"
00054 #include "X86IntrinsicsInfo.h"
00055 #include <bitset>
00056 #include <numeric>
00057 #include <cctype>
00058 using namespace llvm;
00059 
00060 #define DEBUG_TYPE "x86-isel"
00061 
00062 STATISTIC(NumTailCalls, "Number of tail calls");
00063 
00064 static cl::opt<bool> ExperimentalVectorWideningLegalization(
00065     "x86-experimental-vector-widening-legalization", cl::init(false),
00066     cl::desc("Enable an experimental vector type legalization through widening "
00067              "rather than promotion."),
00068     cl::Hidden);
00069 
00070 static cl::opt<bool> ExperimentalVectorShuffleLowering(
00071     "x86-experimental-vector-shuffle-lowering", cl::init(true),
00072     cl::desc("Enable an experimental vector shuffle lowering code path."),
00073     cl::Hidden);
00074 
00075 static cl::opt<bool> ExperimentalVectorShuffleLegality(
00076     "x86-experimental-vector-shuffle-legality", cl::init(false),
00077     cl::desc("Enable experimental shuffle legality based on the experimental "
00078              "shuffle lowering. Should only be used with the experimental "
00079              "shuffle lowering."),
00080     cl::Hidden);
00081 
00082 static cl::opt<int> ReciprocalEstimateRefinementSteps(
00083     "x86-recip-refinement-steps", cl::init(1),
00084     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
00085              "result of the hardware reciprocal estimate instruction."),
00086     cl::NotHidden);
00087 
00088 // Forward declarations.
00089 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
00090                        SDValue V2);
00091 
00092 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
00093                                 SelectionDAG &DAG, SDLoc dl,
00094                                 unsigned vectorWidth) {
00095   assert((vectorWidth == 128 || vectorWidth == 256) &&
00096          "Unsupported vector width");
00097   EVT VT = Vec.getValueType();
00098   EVT ElVT = VT.getVectorElementType();
00099   unsigned Factor = VT.getSizeInBits()/vectorWidth;
00100   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
00101                                   VT.getVectorNumElements()/Factor);
00102 
00103   // Extract from UNDEF is UNDEF.
00104   if (Vec.getOpcode() == ISD::UNDEF)
00105     return DAG.getUNDEF(ResultVT);
00106 
00107   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
00108   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
00109 
00110   // This is the index of the first element of the vectorWidth-bit chunk
00111   // we want.
00112   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
00113                                * ElemsPerChunk);
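        // Worked example (editor's note, not in the original source): for a
        // v8i32 source with IdxVal = 4 and vectorWidth = 128, ElemsPerChunk =
        // 128/32 = 4 and NormalizedIdxVal = ((4*32)/128)*4 = 4, i.e. the
        // 128-bit chunk holding elements [4,8).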
00114 
00115   // If the input is a buildvector just emit a smaller one.
00116   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
00117     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
00118                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
00119                                     ElemsPerChunk));
00120 
00121   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00122   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
00123 }
00124 
00125 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
00126 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
00127 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
00128 /// instructions or a simple subregister reference. Idx is an index in the
00129 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
00130 /// lowering EXTRACT_VECTOR_ELT operations easier.
00131 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
00132                                    SelectionDAG &DAG, SDLoc dl) {
00133   assert((Vec.getValueType().is256BitVector() ||
00134           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
00135   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
00136 }
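      // Minimal usage sketch (editor's illustration, not from the original
      // source): splitting a v8i32 value into its two 128-bit halves.
      //   SDValue Lo = Extract128BitVector(Vec, 0, DAG, dl); // elements 0..3
      //   SDValue Hi = Extract128BitVector(Vec, 4, DAG, dl); // elements 4..7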
00137 
00138 /// Generate a DAG to grab 256-bits from a 512-bit vector.
00139 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
00140                                    SelectionDAG &DAG, SDLoc dl) {
00141   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
00142   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
00143 }
00144 
00145 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
00146                                unsigned IdxVal, SelectionDAG &DAG,
00147                                SDLoc dl, unsigned vectorWidth) {
00148   assert((vectorWidth == 128 || vectorWidth == 256) &&
00149          "Unsupported vector width");
00150   // Inserting an UNDEF vector leaves Result unchanged.
00151   if (Vec.getOpcode() == ISD::UNDEF)
00152     return Result;
00153   EVT VT = Vec.getValueType();
00154   EVT ElVT = VT.getVectorElementType();
00155   EVT ResultVT = Result.getValueType();
00156 
00157   // Insert the relevant vectorWidth bits.
00158   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
00159 
00160   // This is the index of the first element of the vectorWidth-bit chunk
00161   // we want.
00162   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
00163                                * ElemsPerChunk);
00164 
00165   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00166   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
00167 }
00168 
00169 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
00170 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
00171 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
00172 /// simple superregister reference.  Idx is an index in the 128 bits
00173 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
00174 /// lowering INSERT_VECTOR_ELT operations easier.
00175 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
00176                                   SelectionDAG &DAG,SDLoc dl) {
00177   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
00178   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
00179 }
00180 
00181 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
00182                                   SelectionDAG &DAG, SDLoc dl) {
00183   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
00184   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
00185 }
00186 
00187 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
00188 /// instructions. This is used because creating CONCAT_VECTOR nodes of
00189 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
00190 /// large BUILD_VECTORS.
00191 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
00192                                    unsigned NumElems, SelectionDAG &DAG,
00193                                    SDLoc dl) {
00194   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00195   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
00196 }
00197 
00198 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
00199                                    unsigned NumElems, SelectionDAG &DAG,
00200                                    SDLoc dl) {
00201   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00202   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
00203 }
00204 
00205 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
00206     : TargetLowering(TM) {
00207   Subtarget = &TM.getSubtarget<X86Subtarget>();
00208   X86ScalarSSEf64 = Subtarget->hasSSE2();
00209   X86ScalarSSEf32 = Subtarget->hasSSE1();
00210   TD = getDataLayout();
00211 
00212   // Set up the TargetLowering object.
00213   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
00214 
00215   // X86 is weird. It always uses i8 for shift amounts and setcc results.
00216   setBooleanContents(ZeroOrOneBooleanContent);
00217   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
00218   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
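        // Concretely (editor's note): a scalar SETCC produces 0 or 1 in an i8
        // register, while a vector compare such as PCMPGTD produces a per-lane
        // mask of all-ones (-1) or all-zeros.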
00219 
00220   // For 64-bit, since we have so many registers, use the ILP scheduler.
00221   // For 32-bit, use the register pressure specific scheduling.
00222   // For Atom, always use ILP scheduling.
00223   if (Subtarget->isAtom())
00224     setSchedulingPreference(Sched::ILP);
00225   else if (Subtarget->is64Bit())
00226     setSchedulingPreference(Sched::ILP);
00227   else
00228     setSchedulingPreference(Sched::RegPressure);
00229   const X86RegisterInfo *RegInfo =
00230       TM.getSubtarget<X86Subtarget>().getRegisterInfo();
00231   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
00232 
00233   // Bypass expensive divides on Atom when compiling with O2.
00234   if (TM.getOptLevel() >= CodeGenOpt::Default) {
00235     if (Subtarget->hasSlowDivide32())
00236       addBypassSlowDiv(32, 8);
00237     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
00238       addBypassSlowDiv(64, 16);
00239   }
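        // For instance (editor's note), addBypassSlowDiv(32, 8) asks codegen to
        // wrap a 32-bit divide in a run-time check and use the much faster
        // 8-bit divide when both operands happen to fit in 8 bits.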
00240 
00241   if (Subtarget->isTargetKnownWindowsMSVC()) {
00242     // Setup Windows compiler runtime calls.
00243     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
00244     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
00245     setLibcallName(RTLIB::SREM_I64, "_allrem");
00246     setLibcallName(RTLIB::UREM_I64, "_aullrem");
00247     setLibcallName(RTLIB::MUL_I64, "_allmul");
00248     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
00249     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
00250     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
00251     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
00252     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
00253 
00254     // The _ftol2 runtime function has an unusual calling conv, which
00255     // is modeled by a special pseudo-instruction.
00256     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
00257     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
00258     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
00259     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
00260   }
00261 
00262   if (Subtarget->isTargetDarwin()) {
00263     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
00264     setUseUnderscoreSetJmp(false);
00265     setUseUnderscoreLongJmp(false);
00266   } else if (Subtarget->isTargetWindowsGNU()) {
00267     // MS runtime is weird: it exports _setjmp, but only plain longjmp (no underscore).
00268     setUseUnderscoreSetJmp(true);
00269     setUseUnderscoreLongJmp(false);
00270   } else {
00271     setUseUnderscoreSetJmp(true);
00272     setUseUnderscoreLongJmp(true);
00273   }
00274 
00275   // Set up the register classes.
00276   addRegisterClass(MVT::i8, &X86::GR8RegClass);
00277   addRegisterClass(MVT::i16, &X86::GR16RegClass);
00278   addRegisterClass(MVT::i32, &X86::GR32RegClass);
00279   if (Subtarget->is64Bit())
00280     addRegisterClass(MVT::i64, &X86::GR64RegClass);
00281 
00282   for (MVT VT : MVT::integer_valuetypes())
00283     setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
00284 
00285   // We don't accept any truncstore of integer registers.
00286   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
00287   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
00288   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
00289   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
00290   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
00291   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
00292 
00293   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
00294 
00295   // SETOEQ and SETUNE require checking two conditions.
00296   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
00297   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
00298   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
00299   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
00300   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
00301   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
00302 
00303   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
00304   // operation.
00305   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
00306   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
00307   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
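        // E.g. (editor's illustration) an i16 UINT_TO_FP is widened to an i32
        // SINT_TO_FP: the operand is zero-extended, and since every u16 fits in
        // the non-negative i32 range the signed conversion gives the same value.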
00308 
00309   if (Subtarget->is64Bit()) {
00310     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
00311     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00312   } else if (!TM.Options.UseSoftFloat) {
00313     // We have an algorithm for SSE2->double, and we turn this into a
00314     // 64-bit FILD followed by conditional FADD for other targets.
00315     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00316     // We have an algorithm for SSE2, and we turn this into a 64-bit
00317     // FILD for other targets.
00318     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
00319   }
00320 
00321   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
00322   // this operation.
00323   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
00324   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
00325 
00326   if (!TM.Options.UseSoftFloat) {
00327     // SSE has no i16 to fp conversion, only i32
00328     if (X86ScalarSSEf32) {
00329       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00330       // f32 and f64 cases are Legal, f80 case is not
00331       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00332     } else {
00333       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
00334       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00335     }
00336   } else {
00337     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00338     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
00339   }
00340 
00341   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
00342   // are Legal, f80 is custom lowered.
00343   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
00344   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
00345 
00346   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
00347   // this operation.
00348   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
00349   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
00350 
00351   if (X86ScalarSSEf32) {
00352     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
00353     // f32 and f64 cases are Legal, f80 case is not
00354     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00355   } else {
00356     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
00357     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00358   }
00359 
00360   // Handle FP_TO_UINT by promoting the destination to a larger signed
00361   // conversion.
00362   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
00363   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
00364   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
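        // E.g. (editor's illustration) an i16 FP_TO_UINT is performed as an i32
        // FP_TO_SINT followed by a truncate; every u16 result is representable
        // in the signed i32 range, so no precision is lost.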
00365 
00366   if (Subtarget->is64Bit()) {
00367     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
00368     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
00369   } else if (!TM.Options.UseSoftFloat) {
00370     // Since AVX is a superset of SSE3, only check for SSE here.
00371     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
00372       // Expand FP_TO_UINT into a select.
00373       // FIXME: We would like to use a Custom expander here eventually to do
00374       // the optimal thing for SSE vs. the default expansion in the legalizer.
00375       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
00376     else
00377       // With SSE3 we can use fisttpll to convert to a signed i64; without
00378       // SSE, we're stuck with a fistpll.
00379       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
00380   }
00381 
00382   if (isTargetFTOL()) {
00383     // Use the _ftol2 runtime function, which has a pseudo-instruction
00384     // to handle its weird calling convention.
00385     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
00386   }
00387 
00388   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
00389   if (!X86ScalarSSEf64) {
00390     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
00391     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
00392     if (Subtarget->is64Bit()) {
00393       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
00394       // Without SSE, i64->f64 goes through memory.
00395       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
00396     }
00397   }
00398 
00399   // Scalar integer divide and remainder are lowered to use operations that
00400   // produce two results, to match the available instructions. This exposes
00401   // the two-result form to trivial CSE, which is able to combine x/y and x%y
00402   // into a single instruction.
00403   //
00404   // Scalar integer multiply-high is also lowered to use two-result
00405   // operations, to match the available instructions. However, plain multiply
00406   // (low) operations are left as Legal, as there are single-result
00407   // instructions for this in x86. Using the two-result multiply instructions
00408   // when both high and low results are needed must be arranged by dagcombine.
00409   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00410     MVT VT = IntVTs[i];
00411     setOperationAction(ISD::MULHS, VT, Expand);
00412     setOperationAction(ISD::MULHU, VT, Expand);
00413     setOperationAction(ISD::SDIV, VT, Expand);
00414     setOperationAction(ISD::UDIV, VT, Expand);
00415     setOperationAction(ISD::SREM, VT, Expand);
00416     setOperationAction(ISD::UREM, VT, Expand);
00417 
00418     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
00419     setOperationAction(ISD::ADDC, VT, Custom);
00420     setOperationAction(ISD::ADDE, VT, Custom);
00421     setOperationAction(ISD::SUBC, VT, Custom);
00422     setOperationAction(ISD::SUBE, VT, Custom);
00423   }
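        // Editor's illustration of the CSE benefit described above: IR such as
        //   %q = sdiv i32 %x, %y
        //   %r = srem i32 %x, %y
        // is legalized to a single ISD::SDIVREM node, which selects to one IDIV
        // producing the quotient in EAX and the remainder in EDX.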
00424 
00425   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
00426   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
00427   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
00428   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
00429   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
00430   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
00431   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
00432   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
00433   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
00434   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
00435   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
00436   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
00437   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
00438   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
00439   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
00440   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
00441   if (Subtarget->is64Bit())
00442     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
00443   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
00444   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
00445   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
00446   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
00447   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
00448   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
00449   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
00450   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
00451 
00452   // Promote the i8 variants and force them on up to i32 which has a shorter
00453   // encoding.
00454   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
00455   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
00456   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
00457   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
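        // Illustrative effect (editor's note): an i8 cttz is widened to an i32
        // cttz, so a 32-bit BSF/TZCNT can be used; there is no 8-bit form, and
        // the 32-bit encoding avoids the 16-bit operand-size prefix.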
00458   if (Subtarget->hasBMI()) {
00459     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
00460     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
00461     if (Subtarget->is64Bit())
00462       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
00463   } else {
00464     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
00465     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
00466     if (Subtarget->is64Bit())
00467       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
00468   }
00469 
00470   if (Subtarget->hasLZCNT()) {
00471     // When promoting the i8 variants, force them to i32 for a shorter
00472     // encoding.
00473     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
00474     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
00475     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
00476     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
00477     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
00478     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
00479     if (Subtarget->is64Bit())
00480       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
00481   } else {
00482     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
00483     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
00484     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
00485     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
00486     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
00487     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
00488     if (Subtarget->is64Bit()) {
00489       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
00490       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
00491     }
00492   }
00493 
00494   // Special handling for half-precision floating point conversions.
00495   // If we don't have F16C support, then lower half float conversions
00496   // into library calls.
00497   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
00498     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
00499     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
00500   }
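        // Editor's note (assumed lowering): with the Expand action above, an
        // fp16 <-> f32 conversion becomes a runtime call such as
        // __gnu_h2f_ieee/__gnu_f2h_ieee; with F16C it can instead select
        // VCVTPH2PS/VCVTPS2PH.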
00501 
00502   // There's never any support for operations beyond MVT::f32.
00503   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
00504   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
00505   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
00506   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
00507 
00508   setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
00509   setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
00510   setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
00511   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
00512   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
00513   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
00514 
00515   if (Subtarget->hasPOPCNT()) {
00516     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
00517   } else {
00518     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
00519     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
00520     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
00521     if (Subtarget->is64Bit())
00522       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
00523   }
00524 
00525   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
00526 
00527   if (!Subtarget->hasMOVBE())
00528     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
00529 
00530   // These should be promoted to a larger select which is supported.
00531   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
00532   // X86 wants to expand cmov itself.
00533   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
00534   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
00535   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
00536   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
00537   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
00538   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
00539   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
00540   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
00541   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
00542   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
00543   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
00544   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
00545   if (Subtarget->is64Bit()) {
00546     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
00547     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
00548   }
00549   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
00550   // NOTE: the EH_SJLJ_SETJMP/_LONGJMP support here is NOT intended for SjLj
00551   // exception handling, but as a light-weight setjmp/longjmp replacement
00552   // used for continuations, user-level threading, and the like. As a
00553   // result, no other SjLj exception interfaces are implemented, so please
00554   // don't build your own exception handling on top of them.
00555   // LLVM/Clang supports zero-cost DWARF exception handling.
00556   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
00557   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
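        // For reference (editor's note): these nodes back the
        // @llvm.eh.sjlj.setjmp and @llvm.eh.sjlj.longjmp intrinsics, i.e. a
        // cheap buffer-based save/restore of IP and SP rather than full
        // SjLj-style unwinding.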
00558 
00559   // Darwin ABI issue.
00560   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
00561   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
00562   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
00563   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
00564   if (Subtarget->is64Bit())
00565     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
00566   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
00567   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
00568   if (Subtarget->is64Bit()) {
00569     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
00570     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
00571     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
00572     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
00573     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
00574   }
00575   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
00576   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
00577   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
00578   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
00579   if (Subtarget->is64Bit()) {
00580     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
00581     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
00582     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
00583   }
00584 
00585   if (Subtarget->hasSSE1())
00586     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
00587 
00588   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
00589 
00590   // Expand certain atomics
00591   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00592     MVT VT = IntVTs[i];
00593     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
00594     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
00595     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
00596   }
00597 
00598   if (Subtarget->hasCmpxchg16b()) {
00599     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
00600   }
00601 
00602   // FIXME - use subtarget debug flags
00603   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
00604       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
00605     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
00606   }
00607 
00608   if (Subtarget->is64Bit()) {
00609     setExceptionPointerRegister(X86::RAX);
00610     setExceptionSelectorRegister(X86::RDX);
00611   } else {
00612     setExceptionPointerRegister(X86::EAX);
00613     setExceptionSelectorRegister(X86::EDX);
00614   }
00615   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
00616   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
00617 
00618   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
00619   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
00620 
00621   setOperationAction(ISD::TRAP, MVT::Other, Legal);
00622   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
00623 
00624   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
00625   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
00626   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
00627   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
00628     // TargetInfo::X86_64ABIBuiltinVaList
00629     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
00630     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
00631   } else {
00632     // TargetInfo::CharPtrBuiltinVaList
00633     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
00634     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
00635   }
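        // Background (editor's note): the SysV x86-64 va_list is a struct that
        // tracks separate GP and FP register-save areas, hence the Custom
        // lowering; on 32-bit and Win64 a va_list is just a char*, so the
        // default expansion suffices.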
00636 
00637   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
00638   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
00639 
00640   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
00641 
00642   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
00643     // f32 and f64 use SSE.
00644     // Set up the FP register classes.
00645     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00646     addRegisterClass(MVT::f64, &X86::FR64RegClass);
00647 
00648     // Use ANDPD to simulate FABS.
00649     setOperationAction(ISD::FABS , MVT::f64, Custom);
00650     setOperationAction(ISD::FABS , MVT::f32, Custom);
00651 
00652     // Use XORP to simulate FNEG.
00653     setOperationAction(ISD::FNEG , MVT::f64, Custom);
00654     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00655 
00656     // Use ANDPD and ORPD to simulate FCOPYSIGN.
00657     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
00658     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00659 
00660     // Lower this to FGETSIGNx86 plus an AND.
00661     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
00662     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
00663 
00664     // We don't support sin/cos/fmod
00665     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00666     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00667     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00668     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00669     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00670     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00671 
00672     // Expand FP immediates into loads from the stack, except for the special
00673     // cases we handle.
00674     addLegalFPImmediate(APFloat(+0.0)); // xorpd
00675     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00676   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
00677     // Use SSE for f32, x87 for f64.
00678     // Set up the FP register classes.
00679     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00680     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00681 
00682     // Use ANDPS to simulate FABS.
00683     setOperationAction(ISD::FABS , MVT::f32, Custom);
00684 
00685     // Use XORP to simulate FNEG.
00686     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00687 
00688     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00689 
00690     // Use ANDPS and ORPS to simulate FCOPYSIGN.
00691     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00692     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00693 
00694     // We don't support sin/cos/fmod
00695     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00696     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00697     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00698 
00699     // Special cases we handle for FP constants.
00700     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00701     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00702     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00703     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00704     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00705 
00706     if (!TM.Options.UnsafeFPMath) {
00707       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00708       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00709       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00710     }
00711   } else if (!TM.Options.UseSoftFloat) {
00712     // f32 and f64 in x87.
00713     // Set up the FP register classes.
00714     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00715     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
00716 
00717     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00718     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
00719     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00720     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
00721 
00722     if (!TM.Options.UnsafeFPMath) {
00723       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00724       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00725       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00726       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00727       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00728       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00729     }
00730     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00731     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00732     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00733     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00734     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
00735     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
00736     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
00737     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
00738   }
00739 
00740   // We don't support FMA.
00741   setOperationAction(ISD::FMA, MVT::f64, Expand);
00742   setOperationAction(ISD::FMA, MVT::f32, Expand);
00743 
00744   // Long double always uses X87.
00745   if (!TM.Options.UseSoftFloat) {
00746     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
00747     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
00748     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
00749     {
00750       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
00751       addLegalFPImmediate(TmpFlt);  // FLD0
00752       TmpFlt.changeSign();
00753       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
00754 
00755       bool ignored;
00756       APFloat TmpFlt2(+1.0);
00757       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
00758                       &ignored);
00759       addLegalFPImmediate(TmpFlt2);  // FLD1
00760       TmpFlt2.changeSign();
00761       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
00762     }
00763 
00764     if (!TM.Options.UnsafeFPMath) {
00765       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
00766       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
00767       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
00768     }
00769 
00770     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
00771     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
00772     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
00773     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
00774     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
00775     setOperationAction(ISD::FMA, MVT::f80, Expand);
00776   }
00777 
00778   // Always use a library call for pow.
00779   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
00780   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
00781   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
00782 
00783   setOperationAction(ISD::FLOG, MVT::f80, Expand);
00784   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
00785   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
00786   setOperationAction(ISD::FEXP, MVT::f80, Expand);
00787   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
00788   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
00789   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
00790 
00791   // First set operation action for all vector types to either promote
00792   // (for widening) or expand (for scalarization). Then we will selectively
00793   // turn on ones that can be effectively codegen'd.
00794   for (MVT VT : MVT::vector_valuetypes()) {
00795     setOperationAction(ISD::ADD , VT, Expand);
00796     setOperationAction(ISD::SUB , VT, Expand);
00797     setOperationAction(ISD::FADD, VT, Expand);
00798     setOperationAction(ISD::FNEG, VT, Expand);
00799     setOperationAction(ISD::FSUB, VT, Expand);
00800     setOperationAction(ISD::MUL , VT, Expand);
00801     setOperationAction(ISD::FMUL, VT, Expand);
00802     setOperationAction(ISD::SDIV, VT, Expand);
00803     setOperationAction(ISD::UDIV, VT, Expand);
00804     setOperationAction(ISD::FDIV, VT, Expand);
00805     setOperationAction(ISD::SREM, VT, Expand);
00806     setOperationAction(ISD::UREM, VT, Expand);
00807     setOperationAction(ISD::LOAD, VT, Expand);
00808     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
00809     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
00810     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
00811     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
00812     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
00813     setOperationAction(ISD::FABS, VT, Expand);
00814     setOperationAction(ISD::FSIN, VT, Expand);
00815     setOperationAction(ISD::FSINCOS, VT, Expand);
00816     setOperationAction(ISD::FCOS, VT, Expand);
00817     setOperationAction(ISD::FSINCOS, VT, Expand);
00818     setOperationAction(ISD::FREM, VT, Expand);
00819     setOperationAction(ISD::FMA,  VT, Expand);
00820     setOperationAction(ISD::FPOWI, VT, Expand);
00821     setOperationAction(ISD::FSQRT, VT, Expand);
00822     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
00823     setOperationAction(ISD::FFLOOR, VT, Expand);
00824     setOperationAction(ISD::FCEIL, VT, Expand);
00825     setOperationAction(ISD::FTRUNC, VT, Expand);
00826     setOperationAction(ISD::FRINT, VT, Expand);
00827     setOperationAction(ISD::FNEARBYINT, VT, Expand);
00828     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
00829     setOperationAction(ISD::MULHS, VT, Expand);
00830     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
00831     setOperationAction(ISD::MULHU, VT, Expand);
00832     setOperationAction(ISD::SDIVREM, VT, Expand);
00833     setOperationAction(ISD::UDIVREM, VT, Expand);
00834     setOperationAction(ISD::FPOW, VT, Expand);
00835     setOperationAction(ISD::CTPOP, VT, Expand);
00836     setOperationAction(ISD::CTTZ, VT, Expand);
00837     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
00838     setOperationAction(ISD::CTLZ, VT, Expand);
00839     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
00840     setOperationAction(ISD::SHL, VT, Expand);
00841     setOperationAction(ISD::SRA, VT, Expand);
00842     setOperationAction(ISD::SRL, VT, Expand);
00843     setOperationAction(ISD::ROTL, VT, Expand);
00844     setOperationAction(ISD::ROTR, VT, Expand);
00845     setOperationAction(ISD::BSWAP, VT, Expand);
00846     setOperationAction(ISD::SETCC, VT, Expand);
00847     setOperationAction(ISD::FLOG, VT, Expand);
00848     setOperationAction(ISD::FLOG2, VT, Expand);
00849     setOperationAction(ISD::FLOG10, VT, Expand);
00850     setOperationAction(ISD::FEXP, VT, Expand);
00851     setOperationAction(ISD::FEXP2, VT, Expand);
00852     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
00853     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
00854     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
00855     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
00856     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
00857     setOperationAction(ISD::TRUNCATE, VT, Expand);
00858     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
00859     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
00860     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
00861     setOperationAction(ISD::VSELECT, VT, Expand);
00862     setOperationAction(ISD::SELECT_CC, VT, Expand);
00863     for (MVT InnerVT : MVT::vector_valuetypes()) {
00864       setTruncStoreAction(InnerVT, VT, Expand);
00865 
00866       setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
00867       setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
00868 
00869       // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
00870       // types, we have to deal with them whether we ask for Expansion or not.
00871       // Setting Expand causes its own optimisation problems though, so leave
00872       // them legal.
00873       if (VT.getVectorElementType() == MVT::i1)
00874         setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
00875     }
00876   }
00877 
00878   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
00879   // with -msoft-float, disable use of MMX as well.
00880   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
00881     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
00882     // No operations on x86mmx are supported; everything uses intrinsics.
00883   }
00884 
00885   // MMX-sized vectors (other than x86mmx) are expected to be expanded
00886   // into smaller operations.
00887   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
00888   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
00889   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
00890   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
00891   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
00892   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
00893   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
00894   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
00895   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
00896   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
00897   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
00898   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
00899   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
00900   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
00901   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
00902   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
00903   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
00904   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
00905   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
00906   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
00907   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
00908   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
00909   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
00910   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
00911   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
00912   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
00913   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
00914   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
00915   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
00916 
00917   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
00918     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
00919 
00920     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
00921     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
00922     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
00923     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
00924     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
00925     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
00926     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
00927     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
00928     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
00929     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
00930     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00931     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
00932     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
00933   }
00934 
00935   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
00936     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
00937 
00938     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
00939     // registers cannot be used even for integer operations.
00940     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
00941     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
00942     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
00943     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
00944 
00945     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
00946     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
00947     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
00948     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
00949     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
00950     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
00951     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
00952     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
00953     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
00954     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
00955     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
00956     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
00957     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
00958     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
00959     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
00960     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
00961     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
00962     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
00963     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
00964     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
00965     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
00966     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
00967 
00968     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
00969     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
00970     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
00971     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
00972 
00973     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
00974     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
00975     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
00976     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
00977     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
00978 
00979     // Only provide customized ctpop vector bit twiddling for vector types we
00980     // know to perform better than using the popcnt instructions on each vector
00981     // element. If popcnt isn't supported, always provide the custom version.
00982     if (!Subtarget->hasPOPCNT()) {
00983       setOperationAction(ISD::CTPOP,            MVT::v4i32, Custom);
00984       setOperationAction(ISD::CTPOP,            MVT::v2i64, Custom);
00985     }
00986 
00987     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
00988     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
00989       MVT VT = (MVT::SimpleValueType)i;
00990       // Do not attempt to custom lower non-power-of-2 vectors
00991       if (!isPowerOf2_32(VT.getVectorNumElements()))
00992         continue;
00993       // Do not attempt to custom lower non-128-bit vectors
00994       if (!VT.is128BitVector())
00995         continue;
00996       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
00997       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
00998       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
00999     }
01000 
01001     // We support custom legalizing of sext and anyext loads for specific
01002     // memory vector types which we can load as a scalar (or sequence of
01003     // scalars) and extend in-register to a legal 128-bit vector type. For sext
01004     // loads these must work with a single scalar load.
01005     for (MVT VT : MVT::integer_vector_valuetypes()) {
01006       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
01007       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
01008       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
01009       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
01010       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
01011       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
01012       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
01013       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
01014       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
01015     }
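          // Editor's illustration: a sextload of <4 x i8> to v4i32 can be done
          // with a single scalar i32 load of the four bytes followed by an
          // in-register sign extension, rather than four separate byte loads.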
01016 
01017     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
01018     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
01019     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
01020     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
01021     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
01022     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
01023 
01024     if (Subtarget->is64Bit()) {
01025       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01026       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01027     }
01028 
01029     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
01030     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
01031       MVT VT = (MVT::SimpleValueType)i;
01032 
01033       // Do not attempt to promote non-128-bit vectors
01034       if (!VT.is128BitVector())
01035         continue;
01036 
01037       setOperationAction(ISD::AND,    VT, Promote);
01038       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
01039       setOperationAction(ISD::OR,     VT, Promote);
01040       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
01041       setOperationAction(ISD::XOR,    VT, Promote);
01042       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
01043       setOperationAction(ISD::LOAD,   VT, Promote);
01044       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
01045       setOperationAction(ISD::SELECT, VT, Promote);
01046       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
01047     }
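          // Net effect (editor's note): e.g. an AND of two v4i32 values is
          // bitcast to v2i64, selected as a single PAND, and bitcast back; the
          // bitcasts are free because they merely reinterpret the XMM register.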
01048 
01049     // Custom lower v2i64 and v2f64 selects.
01050     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
01051     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
01052     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
01053     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
01054 
01055     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
01056     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
01057 
01058     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
01059     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
01060     // As there is no 64-bit GPR available, we need to build a special custom
01061     // sequence to convert from v2i32 to v2f32.
01062     if (!Subtarget->is64Bit())
01063       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
01064 
01065     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
01066     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
01067 
01068     for (MVT VT : MVT::fp_vector_valuetypes())
01069       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
01070 
01071     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
01072     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
01073     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
01074   }
01075 
01076   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
01077     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
01078     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
01079     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
01080     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
01081     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
01082     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
01083     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
01084     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
01085     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
01086     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
01087 
01088     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
01089     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
01090     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
01091     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
01092     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
01093     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
01094     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
01095     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
01096     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
01097     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
01098 
01099     // FIXME: Do we need to handle scalar-to-vector here?
01100     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
01101 
01102     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
01103     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
01104     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
01105     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
01106     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
01107     // There is no BLENDI for byte vectors, so we don't need to custom lower
01108     // v16i8 vselects for now.
01109     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
01110 
01111     // SSE41 brings specific instructions for doing vector sign extend even in
01112     // cases where we don't have SRA.
01113     for (MVT VT : MVT::integer_vector_valuetypes()) {
01114       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
01115       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
01116       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
01117     }
01118 
01119     // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
01120     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
01121     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
01122     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
01123     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
01124     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
01125     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
01126 
01127     setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8,  Legal);
01128     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8,  Legal);
01129     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8,  Legal);
01130     setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
01131     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
01132     setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
01133 
01134     // i8 and i16 vectors are custom because the source register and source
01135     // memory operand types are not the same width.  f32 vectors are
01136     // custom since the immediate controlling the insert encodes additional
01137     // information.
01138     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
01139     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
01140     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
01141     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
01142 
01143     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
01144     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
01145     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
01146     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
01147 
01148     // FIXME: these should be Legal, but that's only for the case where
01149     // the index is constant.  For now custom expand to deal with that.
01150     if (Subtarget->is64Bit()) {
01151       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01152       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01153     }
01154   }
01155 
01156   if (Subtarget->hasSSE2()) {
01157     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
01158     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
01159 
01160     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
01161     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
01162 
01163     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
01164     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
01165 
01166     // In the customized shift lowering, the legal cases in AVX2 will be
01167     // recognized.
01168     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
01169     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
01170 
01171     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
01172     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
01173 
01174     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
01175   }
01176 
01177   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
01178     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
01179     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
01180     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
01181     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
01182     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
01183     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
01184 
01185     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
01186     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
01187     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
01188 
01189     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
01190     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
01191     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
01192     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
01193     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
01194     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
01195     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
01196     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
01197     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
01198     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
01199     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
01200     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
01201 
01202     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
01203     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
01204     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
01205     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
01206     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
01207     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
01208     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
01209     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
01210     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
01211     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
01212     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
01213     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
01214 
01215     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
01216     // even though v8i16 is a legal type.
01217     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
01218     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
01219     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
01220 
01221     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
01222     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
01223     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
01224 
01225     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
01226     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
01227 
01228     for (MVT VT : MVT::fp_vector_valuetypes())
01229       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
01230 
01231     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
01232     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
01233 
01234     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
01235     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
01236 
01237     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
01238     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
01239 
01240     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
01241     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
01242     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
01243     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
01244 
01245     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
01246     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
01247     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
01248 
01249     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
01250     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
01251     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
01252     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
01253 
01254     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
01255     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
01256     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
01257     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
01258     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
01259     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
01260     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
01261     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
01262     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
01263     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
01264     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
01265     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
01266 
01267     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
01268       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
01269       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
01270       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
01271       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
01272       setOperationAction(ISD::FMA,             MVT::f32, Legal);
01273       setOperationAction(ISD::FMA,             MVT::f64, Legal);
01274     }
01275 
01276     if (Subtarget->hasInt256()) {
01277       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
01278       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
01279       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
01280       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
01281 
01282       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
01283       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
01284       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
01285       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
01286 
01287       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01288       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
01289       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
01290       // Don't lower v32i8 because there is no 128-bit byte mul
01291 
01292       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
01293       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
01294       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
01295       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
01296 
01297       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
01298       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
01299 
01300       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
01301       // when we have a 256-bit-wide blend with immediate.
01302       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
01303 
01304       // Only provide customized ctpop vector bit twiddling for vector types
01305       // where we know it performs better than using the popcnt instructions
01306       // on each vector element. If popcnt isn't supported, always provide the
01307       // custom version.
01308       if (!Subtarget->hasPOPCNT())
01309         setOperationAction(ISD::CTPOP,           MVT::v4i64, Custom);
01310 
01311       // Custom CTPOP always performs better on natively supported v8i32
01312       setOperationAction(ISD::CTPOP,             MVT::v8i32, Custom);
01313 
01314       // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
01315       setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
01316       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
01317       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
01318       setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
01319       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
01320       setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
01321 
01322       setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
01323       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i8,  Legal);
01324       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i8,  Legal);
01325       setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32,  MVT::v8i16, Legal);
01326       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i16, Legal);
01327       setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64,  MVT::v4i32, Legal);
01328     } else {
01329       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
01330       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
01331       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
01332       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
01333 
01334       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
01335       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
01336       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
01337       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
01338 
01339       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01340       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
01341       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
01342       // Don't lower v32i8 because there is no 128-bit byte mul
01343     }
01344 
01345     // In the customized shift lowering, the legal cases in AVX2 will be
01346     // recognized.
01347     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
01348     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
01349 
01350     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
01351     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
01352 
01353     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
01354 
01355     // Custom lower several nodes for 256-bit types.
01356     for (MVT VT : MVT::vector_valuetypes()) {
01357       if (VT.getScalarSizeInBits() >= 32) {
01358         setOperationAction(ISD::MLOAD,  VT, Legal);
01359         setOperationAction(ISD::MSTORE, VT, Legal);
01360       }
01361       // Extract subvector is special because the value type
01362       // (result) is 128-bit but the source is 256-bit wide.
01363       if (VT.is128BitVector()) {
01364         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01365       }
01366       // Do not attempt to custom lower other non-256-bit vectors
01367       if (!VT.is256BitVector())
01368         continue;
01369 
01370       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01371       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01372       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
01373       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01374       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
01375       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
01376       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
01377     }
01378 
01379     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
01380     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
01381       MVT VT = (MVT::SimpleValueType)i;
01382 
01383       // Do not attempt to promote non-256-bit vectors
01384       if (!VT.is256BitVector())
01385         continue;
01386 
01387       setOperationAction(ISD::AND,    VT, Promote);
01388       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
01389       setOperationAction(ISD::OR,     VT, Promote);
01390       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
01391       setOperationAction(ISD::XOR,    VT, Promote);
01392       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
01393       setOperationAction(ISD::LOAD,   VT, Promote);
01394       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
01395       setOperationAction(ISD::SELECT, VT, Promote);
01396       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
01397     }
01398   }
01399 
01400   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
01401     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
01402     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
01403     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
01404     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
01405 
01406     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
01407     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
01408     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
01409 
01410     for (MVT VT : MVT::fp_vector_valuetypes())
01411       setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
01412 
01413     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
01414     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
01415     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
01416     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
01417     setOperationAction(ISD::AND,                MVT::i1,    Legal);
01418     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
01419     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
01420     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
01421     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
01422     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
01423 
01424     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
01425     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
01426     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
01427     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
01428     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
01429     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
01430 
01431     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
01432     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
01433     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
01434     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
01435     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
01436     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
01437     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
01438     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
01439 
01440     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
01441     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
01442     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
01443     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
01444     if (Subtarget->is64Bit()) {
01445       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
01446       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
01447       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
01448       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
01449     }
01450     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
01451     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
01452     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
01453     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
01454     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
01455     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
01456     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
01457     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
01458     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
01459     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
01460     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
01461     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
01462     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
01463     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
01464 
01465     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
01466     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
01467     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
01468     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
01469     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
01470     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
01471     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
01472     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
01473     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
01474     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
01475     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
01476     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
01477     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
01478 
01479     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
01480     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
01481     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
01482     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
01483     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
01484     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
01485 
01486     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
01487     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
01488 
01489     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
01490 
01491     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
01492     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
01493     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
01494     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
01495     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
01496     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
01497     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
01498     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
01499     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
01500 
01501     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
01502     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
01503 
01504     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
01505     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
01506 
01507     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
01508 
01509     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
01510     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
01511 
01512     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
01513     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
01514 
01515     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
01516     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
01517 
01518     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
01519     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
01520     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
01521     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
01522     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
01523     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
01524 
01525     if (Subtarget->hasCDI()) {
01526       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
01527       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
01528     }
01529 
01530     // Custom lower several nodes.
01531     for (MVT VT : MVT::vector_valuetypes()) {
01532       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01533       // Extract subvector is special because the value type
01534       // (result) is 256/128-bit but the source is 512-bit wide.
01535       if (VT.is128BitVector() || VT.is256BitVector()) {
01536         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01537       }
01538       if (VT.getVectorElementType() == MVT::i1)
01539         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
01540 
01541       // Do not attempt to custom lower other non-512-bit vectors
01542       if (!VT.is512BitVector())
01543         continue;
01544 
01545       if (EltSize >= 32) {
01546         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
01547         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
01548         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01549         setOperationAction(ISD::VSELECT,             VT, Legal);
01550         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
01551         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
01552         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
01553         setOperationAction(ISD::MLOAD,               VT, Legal);
01554         setOperationAction(ISD::MSTORE,              VT, Legal);
01555       }
01556     }
01557     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01558       MVT VT = (MVT::SimpleValueType)i;
01559 
01560       // Do not attempt to promote non-512-bit vectors.
01561       if (!VT.is512BitVector())
01562         continue;
01563 
01564       setOperationAction(ISD::SELECT, VT, Promote);
01565       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
01566     }
01567   } // has AVX-512
01568 
01569   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
01570     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
01571     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
01572 
01573     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
01574     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
01575 
01576     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
01577     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
01578     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
01579     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
01580     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
01581     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
01582     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
01583     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
01584     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
01585 
01586     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01587       const MVT VT = (MVT::SimpleValueType)i;
01588 
01589       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01590 
01591       // Do not attempt to promote non-512-bit vectors.
01592       if (!VT.is512BitVector())
01593         continue;
01594 
01595       if (EltSize < 32) {
01596         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01597         setOperationAction(ISD::VSELECT,             VT, Legal);
01598       }
01599     }
01600   }
01601 
01602   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
01603     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
01604     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
01605 
01606     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
01607     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
01608     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
01609 
01610     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
01611     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
01612     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
01613     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
01614     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
01615     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
01616   }
01617 
01618   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
01619   // of this type with custom code.
01620   for (MVT VT : MVT::vector_valuetypes())
01621     setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
01622 
01623   // We want to custom lower some of our intrinsics.
01624   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
01625   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
01626   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
01627   if (!Subtarget->is64Bit())
01628     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
01629 
01630   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
01631   // handle type legalization for these operations here.
01632   //
01633   // FIXME: We really should do custom legalization for addition and
01634   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
01635   // than generic legalization for 64-bit multiplication-with-overflow, though.
01636   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
01637     // Add/Sub/Mul with overflow operations are custom lowered.
01638     MVT VT = IntVTs[i];
01639     setOperationAction(ISD::SADDO, VT, Custom);
01640     setOperationAction(ISD::UADDO, VT, Custom);
01641     setOperationAction(ISD::SSUBO, VT, Custom);
01642     setOperationAction(ISD::USUBO, VT, Custom);
01643     setOperationAction(ISD::SMULO, VT, Custom);
01644     setOperationAction(ISD::UMULO, VT, Custom);
01645   }
01646 
01647 
01648   if (!Subtarget->is64Bit()) {
01649     // These libcalls are not available in 32-bit.
01650     setLibcallName(RTLIB::SHL_I128, nullptr);
01651     setLibcallName(RTLIB::SRL_I128, nullptr);
01652     setLibcallName(RTLIB::SRA_I128, nullptr);
01653   }
01654 
01655   // Combine sin / cos into one node or libcall if possible.
01656   if (Subtarget->hasSinCos()) {
01657     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
01658     setLibcallName(RTLIB::SINCOS_F64, "sincos");
01659     if (Subtarget->isTargetDarwin()) {
01660       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
01661       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
01662       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
01663       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
01664     }
01665   }
01666 
01667   if (Subtarget->isTargetWin64()) {
01668     setOperationAction(ISD::SDIV, MVT::i128, Custom);
01669     setOperationAction(ISD::UDIV, MVT::i128, Custom);
01670     setOperationAction(ISD::SREM, MVT::i128, Custom);
01671     setOperationAction(ISD::UREM, MVT::i128, Custom);
01672     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
01673     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
01674   }
01675 
01676   // We have target-specific dag combine patterns for the following nodes:
01677   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
01678   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
01679   setTargetDAGCombine(ISD::VSELECT);
01680   setTargetDAGCombine(ISD::SELECT);
01681   setTargetDAGCombine(ISD::SHL);
01682   setTargetDAGCombine(ISD::SRA);
01683   setTargetDAGCombine(ISD::SRL);
01684   setTargetDAGCombine(ISD::OR);
01685   setTargetDAGCombine(ISD::AND);
01686   setTargetDAGCombine(ISD::ADD);
01687   setTargetDAGCombine(ISD::FADD);
01688   setTargetDAGCombine(ISD::FSUB);
01689   setTargetDAGCombine(ISD::FMA);
01690   setTargetDAGCombine(ISD::SUB);
01691   setTargetDAGCombine(ISD::LOAD);
01692   setTargetDAGCombine(ISD::MLOAD);
01693   setTargetDAGCombine(ISD::STORE);
01694   setTargetDAGCombine(ISD::MSTORE);
01695   setTargetDAGCombine(ISD::ZERO_EXTEND);
01696   setTargetDAGCombine(ISD::ANY_EXTEND);
01697   setTargetDAGCombine(ISD::SIGN_EXTEND);
01698   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
01699   setTargetDAGCombine(ISD::TRUNCATE);
01700   setTargetDAGCombine(ISD::SINT_TO_FP);
01701   setTargetDAGCombine(ISD::SETCC);
01702   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
01703   setTargetDAGCombine(ISD::BUILD_VECTOR);
01704   setTargetDAGCombine(ISD::MUL);
01705   setTargetDAGCombine(ISD::XOR);
01706 
01707   computeRegisterProperties();
01708 
01709   // On Darwin, -Os means optimize for size without hurting performance,
01710   // so do not reduce the limit.
01711   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
01712   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
01713   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
01714   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01715   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
01716   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01717   setPrefLoopAlignment(4); // 2^4 bytes.
01718 
01719   // Predictable cmovs don't hurt on Atom because it's in-order.
01720   PredictableSelectIsExpensive = !Subtarget->isAtom();
01721   EnableExtLdPromotion = true;
01722   setPrefFunctionAlignment(4); // 2^4 bytes.
01723 
01724   verifyIntrinsicTables();
01725 }
01726 
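// The constructor above is essentially filling in a per-(opcode, value type)
// legalization table that the DAG legalizer consults later. The sketch below is
// a simplified, self-contained model of what a Promote entry created with
// setOperationAction + AddPromotedToType means; it is an illustration only, not
// the TargetLoweringBase implementation, and the enumerators are stand-ins for
// the real ISD/MVT ones.
namespace op_action_sketch {
enum Opcode { OP_AND, OP_OR };
enum ValueType { T_v16i8, T_v2i64 };
enum Action { Legal, Promote, Custom, Expand };

struct Entry { Action Act; ValueType PromotedTo; };

// Mirrors setOperationAction(ISD::AND, MVT::v16i8, Promote) plus
// AddPromotedToType(ISD::AND, MVT::v16i8, MVT::v2i64) from the SSE2 block:
// an AND of two v16i8 values is bitcast to v2i64, executed as a single
// 128-bit AND, and bitcast back.
inline Entry getOperationAction(Opcode Op, ValueType VT) {
  if (Op == OP_AND && VT == T_v16i8)
    return Entry{Promote, T_v2i64};
  return Entry{Legal, VT};
}
} // namespace op_action_sketch
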
01727 // This has so far only been implemented for 64-bit MachO.
01728 bool X86TargetLowering::useLoadStackGuardNode() const {
01729   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
01730 }
01731 
01732 TargetLoweringBase::LegalizeTypeAction
01733 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
01734   if (ExperimentalVectorWideningLegalization &&
01735       VT.getVectorNumElements() != 1 &&
01736       VT.getVectorElementType().getSimpleVT() != MVT::i1)
01737     return TypeWidenVector;
01738 
01739   return TargetLoweringBase::getPreferredVectorAction(VT);
01740 }
01741 
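// Concretely: with -x86-experimental-vector-widening-legalization enabled, an
// illegal narrow vector such as v2i8 is widened toward a legal vector type
// rather than having each element promoted. A rough standalone model of the
// choice made above (illustrative only; real widening then picks the next
// legal type, which this sketch does not attempt to compute):
namespace widen_sketch {
enum VectorAction { TypeWidenVector, TypeDefault };
inline VectorAction preferredVectorAction(bool WideningEnabled,
                                          unsigned NumElts, bool EltIsI1) {
  if (WideningEnabled && NumElts != 1 && !EltIsI1)
    return TypeWidenVector;
  return TypeDefault; // fall back to the generic preference
}
} // namespace widen_sketch
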
01742 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
01743   if (!VT.isVector())
01744     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
01745 
01746   const unsigned NumElts = VT.getVectorNumElements();
01747   const EVT EltVT = VT.getVectorElementType();
01748   if (VT.is512BitVector()) {
01749     if (Subtarget->hasAVX512())
01750       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01751           EltVT == MVT::f32 || EltVT == MVT::f64)
01752         switch (NumElts) {
01753         case  8: return MVT::v8i1;
01754         case 16: return MVT::v16i1;
01755         }
01756     if (Subtarget->hasBWI())
01757       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01758         switch (NumElts) {
01759         case 32: return MVT::v32i1;
01760         case 64: return MVT::v64i1;
01761         }
01762   }
01763 
01764   if (VT.is256BitVector() || VT.is128BitVector()) {
01765     if (Subtarget->hasVLX())
01766       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01767           EltVT == MVT::f32 || EltVT == MVT::f64)
01768         switch (NumElts) {
01769         case 2: return MVT::v2i1;
01770         case 4: return MVT::v4i1;
01771         case 8: return MVT::v8i1;
01772         }
01773     if (Subtarget->hasBWI() && Subtarget->hasVLX())
01774       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01775         switch (NumElts) {
01776         case  8: return MVT::v8i1;
01777         case 16: return MVT::v16i1;
01778         case 32: return MVT::v32i1;
01779         }
01780   }
01781 
01782   return VT.changeVectorElementTypeToInteger();
01783 }
01784 
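// Concretely, the function above makes a vector compare produce a mask type
// where the hardware has one: with AVX-512, a setcc on v16f32 yields v16i1 (a
// k-register mask), while without the matching feature it falls back to the
// same-shaped integer vector (v16f32 -> v16i32). A small standalone model of a
// few of those rows (illustrative only; types are plain strings here, and only
// the 512-bit, >=32-bit-element case is modeled):
#include <string>
namespace setcc_sketch {
inline std::string setCCResultType(bool HasAVX512, unsigned VecBits,
                                   unsigned NumElts, unsigned EltBits) {
  if (HasAVX512 && VecBits == 512 && EltBits >= 32)
    return "v" + std::to_string(NumElts) + "i1"; // k-register mask
  // Default: change the element type to an integer of the same width.
  return "v" + std::to_string(NumElts) + "i" + std::to_string(EltBits);
}
} // namespace setcc_sketch
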
01785 /// Helper for getByValTypeAlignment to determine
01786 /// the desired ByVal argument alignment.
01787 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
01788   if (MaxAlign == 16)
01789     return;
01790   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
01791     if (VTy->getBitWidth() == 128)
01792       MaxAlign = 16;
01793   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
01794     unsigned EltAlign = 0;
01795     getMaxByValAlign(ATy->getElementType(), EltAlign);
01796     if (EltAlign > MaxAlign)
01797       MaxAlign = EltAlign;
01798   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
01799     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
01800       unsigned EltAlign = 0;
01801       getMaxByValAlign(STy->getElementType(i), EltAlign);
01802       if (EltAlign > MaxAlign)
01803         MaxAlign = EltAlign;
01804       if (MaxAlign == 16)
01805         break;
01806     }
01807   }
01808 }
01809 
01810 /// Return the desired alignment for ByVal aggregate
01811 /// function arguments in the caller parameter area. For X86, aggregates
01812 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
01813 /// are at 4-byte boundaries.
01814 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
01815   if (Subtarget->is64Bit()) {
01816     // Max of 8 and alignment of type.
01817     unsigned TyAlign = TD->getABITypeAlignment(Ty);
01818     if (TyAlign > 8)
01819       return TyAlign;
01820     return 8;
01821   }
01822 
01823   unsigned Align = 4;
01824   if (Subtarget->hasSSE1())
01825     getMaxByValAlign(Ty, Align);
01826   return Align;
01827 }
01828 
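// Worked example of the two routines above, on 32-bit x86 with SSE1: a byval
// struct containing a 128-bit vector (e.g. { int; __m128; }) is placed on a
// 16-byte boundary, while { int; int; } stays at the default 4. A compact
// standalone model of that recursion (illustrative only; it models just
// 128-bit vectors and aggregates, and ignores the 64-bit path):
#include <algorithm>
#include <vector>
namespace byval_align_sketch {
struct Ty {
  enum Kind { Scalar, Vec128, Aggregate } K;
  std::vector<const Ty *> Elems; // element/field types for aggregates
};
inline unsigned maxByValAlign(const Ty &T) {
  if (T.K == Ty::Vec128)
    return 16; // SSE vector members force 16-byte placement
  unsigned Align = 4; // everything else defaults to 4 on x86-32
  for (const Ty *E : T.Elems)
    Align = std::max(Align, maxByValAlign(*E));
  return Align;
}
} // namespace byval_align_sketch
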
01829 /// Returns the target-specific optimal type for load
01830 /// and store operations as a result of memset, memcpy, and memmove
01831 /// lowering. If DstAlign is zero, the destination alignment can satisfy any
01832 /// constraint. Similarly, if SrcAlign is zero, there is no need to check it
01833 /// against the alignment requirement, probably because the source does not
01834 /// need to be loaded. If 'IsMemset' is true, this is expanding a memset. If
01835 /// 'ZeroMemset' is true, it is a memset of zero. 'MemcpyStrSrc' indicates
01836 /// whether the memcpy source is constant, so it does not need to be
01837 /// loaded.
01838 /// It returns EVT::Other if the type should be determined using generic
01839 /// target-independent logic.
01840 EVT
01841 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
01842                                        unsigned DstAlign, unsigned SrcAlign,
01843                                        bool IsMemset, bool ZeroMemset,
01844                                        bool MemcpyStrSrc,
01845                                        MachineFunction &MF) const {
01846   const Function *F = MF.getFunction();
01847   if ((!IsMemset || ZeroMemset) &&
01848       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
01849                                        Attribute::NoImplicitFloat)) {
01850     if (Size >= 16 &&
01851         (Subtarget->isUnalignedMemAccessFast() ||
01852          ((DstAlign == 0 || DstAlign >= 16) &&
01853           (SrcAlign == 0 || SrcAlign >= 16)))) {
01854       if (Size >= 32) {
01855         if (Subtarget->hasInt256())
01856           return MVT::v8i32;
01857         if (Subtarget->hasFp256())
01858           return MVT::v8f32;
01859       }
01860       if (Subtarget->hasSSE2())
01861         return MVT::v4i32;
01862       if (Subtarget->hasSSE1())
01863         return MVT::v4f32;
01864     } else if (!MemcpyStrSrc && Size >= 8 &&
01865                !Subtarget->is64Bit() &&
01866                Subtarget->hasSSE2()) {
01867       // Do not use f64 to lower memcpy if the source is a string constant.
01868       // It's better to use i32 to avoid the loads.
01869       return MVT::f64;
01870     }
01871   }
01872   if (Subtarget->is64Bit() && Size >= 8)
01873     return MVT::i64;
01874   return MVT::i32;
01875 }
01876 
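// For example: a 32-byte memcpy with 16-byte-aligned (or fast-unaligned)
// operands is lowered with v8i32 on AVX2 and v4i32 on plain SSE2, while a
// small 8-byte copy on a 32-bit SSE2 target may use f64. A condensed
// standalone model of the decision tree above (illustrative only; it returns
// type names as strings and omits the NoImplicitFloat/IsMemset/MemcpyStrSrc
// refinements):
#include <cstdint>
#include <string>
namespace memop_sketch {
inline std::string optimalMemOpType(uint64_t Size, bool AlignedOrFastUnaligned,
                                    bool HasAVX2, bool HasAVX, bool HasSSE2,
                                    bool HasSSE1, bool Is64Bit) {
  if (Size >= 16 && AlignedOrFastUnaligned) {
    if (Size >= 32) {
      if (HasAVX2) return "v8i32";
      if (HasAVX)  return "v8f32";
    }
    if (HasSSE2) return "v4i32";
    if (HasSSE1) return "v4f32";
  } else if (Size >= 8 && !Is64Bit && HasSSE2) {
    return "f64"; // one 8-byte load/store instead of two i32 operations
  }
  return (Is64Bit && Size >= 8) ? "i64" : "i32";
}
} // namespace memop_sketch
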
01877 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
01878   if (VT == MVT::f32)
01879     return X86ScalarSSEf32;
01880   else if (VT == MVT::f64)
01881     return X86ScalarSSEf64;
01882   return true;
01883 }
01884 
01885 bool
01886 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
01887                                                   unsigned,
01888                                                   unsigned,
01889                                                   bool *Fast) const {
01890   if (Fast)
01891     *Fast = Subtarget->isUnalignedMemAccessFast();
01892   return true;
01893 }
01894 
01895 /// Return the entry encoding for a jump table in the
01896 /// current function.  The returned value is a member of the
01897 /// MachineJumpTableInfo::JTEntryKind enum.
01898 unsigned X86TargetLowering::getJumpTableEncoding() const {
01899   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
01900   // symbol.
01901   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01902       Subtarget->isPICStyleGOT())
01903     return MachineJumpTableInfo::EK_Custom32;
01904 
01905   // Otherwise, use the normal jump table encoding heuristics.
01906   return TargetLowering::getJumpTableEncoding();
01907 }
01908 
01909 const MCExpr *
01910 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
01911                                              const MachineBasicBlock *MBB,
01912                                              unsigned uid, MCContext &Ctx) const {
01913   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
01914          Subtarget->isPICStyleGOT());
01915   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
01916   // entries.
01917   return MCSymbolRefExpr::Create(MBB->getSymbol(),
01918                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
01919 }
01920 
01921 /// Returns relocation base for the given PIC jumptable.
01922 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
01923                                                     SelectionDAG &DAG) const {
01924   if (!Subtarget->is64Bit())
01925     // This doesn't have an SDLoc associated with it, but it is not really the
01926     // same as a Register.
01927     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
01928   return Table;
01929 }
01930 
01931 /// This returns the relocation base for the given PIC jumptable,
01932 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
01933 const MCExpr *X86TargetLowering::
01934 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
01935                              MCContext &Ctx) const {
01936   // X86-64 uses RIP relative addressing based on the jump table label.
01937   if (Subtarget->isPICStyleRIPRel())
01938     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
01939 
01940   // Otherwise, the reference is relative to the PIC base.
01941   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
01942 }
01943 
01944 // FIXME: Why is this routine here? Move to RegInfo!
01945 std::pair<const TargetRegisterClass*, uint8_t>
01946 X86TargetLowering::findRepresentativeClass(MVT VT) const{
01947   const TargetRegisterClass *RRC = nullptr;
01948   uint8_t Cost = 1;
01949   switch (VT.SimpleTy) {
01950   default:
01951     return TargetLowering::findRepresentativeClass(VT);
01952   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
01953     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
01954     break;
01955   case MVT::x86mmx:
01956     RRC = &X86::VR64RegClass;
01957     break;
01958   case MVT::f32: case MVT::f64:
01959   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
01960   case MVT::v4f32: case MVT::v2f64:
01961   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
01962   case MVT::v4f64:
01963     RRC = &X86::VR128RegClass;
01964     break;
01965   }
01966   return std::make_pair(RRC, Cost);
01967 }
01968 
01969 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
01970                                                unsigned &Offset) const {
01971   if (!Subtarget->isTargetLinux())
01972     return false;
01973 
01974   if (Subtarget->is64Bit()) {
01975     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:0x28.
01976     Offset = 0x28;
01977     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
01978       AddressSpace = 256;
01979     else
01980       AddressSpace = 257;
01981   } else {
01982     // %gs:0x14 on i386
01983     Offset = 0x14;
01984     AddressSpace = 256;
01985   }
01986   return true;
01987 }
01988 
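// The address-space/offset pairs above match the usual glibc TLS layout: the
// stack-protector canary lives at %fs:0x28 on x86-64 Linux and at %gs:0x14 on
// i386. As a standalone illustration (GCC/Clang inline asm, x86-64 Linux only;
// not part of the lowering code itself, and the helper name is made up), the
// same slot can be read like this:
#if defined(__x86_64__) && defined(__linux__)
#include <cstdint>
static inline uint64_t readStackGuardCanary() {
  uint64_t Canary;
  asm volatile("movq %%fs:0x28, %0" : "=r"(Canary)); // the slot chosen above
  return Canary;
}
#endif
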
01989 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
01990                                             unsigned DestAS) const {
01991   assert(SrcAS != DestAS && "Expected different address spaces!");
01992 
01993   return SrcAS < 256 && DestAS < 256;
01994 }
01995 
01996 //===----------------------------------------------------------------------===//
01997 //               Return Value Calling Convention Implementation
01998 //===----------------------------------------------------------------------===//
01999 
02000 #include "X86GenCallingConv.inc"
02001 
02002 bool
02003 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
02004                                   MachineFunction &MF, bool isVarArg,
02005                         const SmallVectorImpl<ISD::OutputArg> &Outs,
02006                         LLVMContext &Context) const {
02007   SmallVector<CCValAssign, 16> RVLocs;
02008   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
02009   return CCInfo.CheckReturn(Outs, RetCC_X86);
02010 }
02011 
02012 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
02013   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
02014   return ScratchRegs;
02015 }
02016 
02017 SDValue
02018 X86TargetLowering::LowerReturn(SDValue Chain,
02019                                CallingConv::ID CallConv, bool isVarArg,
02020                                const SmallVectorImpl<ISD::OutputArg> &Outs,
02021                                const SmallVectorImpl<SDValue> &OutVals,
02022                                SDLoc dl, SelectionDAG &DAG) const {
02023   MachineFunction &MF = DAG.getMachineFunction();
02024   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02025 
02026   SmallVector<CCValAssign, 16> RVLocs;
02027   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
02028   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
02029 
02030   SDValue Flag;
02031   SmallVector<SDValue, 6> RetOps;
02032   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
02033   // Operand #1 = Bytes To Pop
02034   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
02035                    MVT::i16));
02036 
02037   // Copy the result values into the output registers.
02038   for (unsigned i = 0; i != RVLocs.size(); ++i) {
02039     CCValAssign &VA = RVLocs[i];
02040     assert(VA.isRegLoc() && "Can only return in registers!");
02041     SDValue ValToCopy = OutVals[i];
02042     EVT ValVT = ValToCopy.getValueType();
02043 
02044     // Promote values to the appropriate types.
02045     if (VA.getLocInfo() == CCValAssign::SExt)
02046       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
02047     else if (VA.getLocInfo() == CCValAssign::ZExt)
02048       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
02049     else if (VA.getLocInfo() == CCValAssign::AExt)
02050       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
02051     else if (VA.getLocInfo() == CCValAssign::BCvt)
02052       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
02053 
02054     assert(VA.getLocInfo() != CCValAssign::FPExt &&
02055            "Unexpected FP-extend for return value.");
02056 
02057     // If this is x86-64, and we disabled SSE, we can't return FP values,
02058     // or SSE or MMX vectors.
02059     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
02060          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
02061           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
02062       report_fatal_error("SSE register return with SSE disabled");
02063     }
02064     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
02065     // llvm-gcc has never done it right and no one has noticed, so this
02066     // should be OK for now.
02067     if (ValVT == MVT::f64 &&
02068         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
02069       report_fatal_error("SSE2 register return with SSE2 disabled");
02070 
02071     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
02072     // the RET instruction and handled by the FP Stackifier.
02073     if (VA.getLocReg() == X86::FP0 ||
02074         VA.getLocReg() == X86::FP1) {
02075       // If this is a copy from an xmm register to ST(0), use an FPExtend to
02076       // change the value to the FP stack register class.
02077       if (isScalarFPTypeInSSEReg(VA.getValVT()))
02078         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
02079       RetOps.push_back(ValToCopy);
02080       // Don't emit a copytoreg.
02081       continue;
02082     }
02083 
02084     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
02085     // which is returned in RAX / RDX.
02086     if (Subtarget->is64Bit()) {
02087       if (ValVT == MVT::x86mmx) {
02088         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
02089           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
02090           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
02091                                   ValToCopy);
02092           // If we don't have SSE2 available, convert to v4f32 so the generated
02093           // register is legal.
02094           if (!Subtarget->hasSSE2())
02095             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, ValToCopy);
02096         }
02097       }
02098     }
02099 
02100     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
02101     Flag = Chain.getValue(1);
02102     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
02103   }
02104 
02105   // The x86-64 ABIs require that for returning structs by value we copy
02106   // the sret argument into %rax/%eax (depending on ABI) for the return.
02107   // Win32 requires us to put the sret argument to %eax as well.
02108   // We saved the argument into a virtual register in the entry block,
02109   // so now we copy the value out and into %rax/%eax.
02110   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
02111       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
02112     MachineFunction &MF = DAG.getMachineFunction();
02113     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02114     unsigned Reg = FuncInfo->getSRetReturnReg();
02115     assert(Reg &&
02116            "SRetReturnReg should have been set in LowerFormalArguments().");
02117     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
02118 
02119     unsigned RetValReg
02120         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
02121           X86::RAX : X86::EAX;
02122     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
02123     Flag = Chain.getValue(1);
02124 
02125     // RAX/EAX now acts like a return value.
02126     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
02127   }
02128 
02129   RetOps[0] = Chain;  // Update chain.
02130 
02131   // Add the flag if we have it.
02132   if (Flag.getNode())
02133     RetOps.push_back(Flag);
02134 
02135   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
02136 }
02137 
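// For reference, the sret handling at the end of LowerReturn above is what the
// following kind of C++ relies on: an aggregate too large to come back in
// registers is returned through a hidden pointer argument, and that pointer is
// copied back into RAX/EAX, as set up in the block above, so the caller can
// locate the result. Small standalone illustration; the struct and function
// names are made up for the example.
namespace sret_sketch {
struct BigValue { long Data[8]; }; // too large to be returned in registers

inline BigValue makeBigValue() {
  BigValue V = {};
  V.Data[0] = 42;
  return V; // lowered with an sret pointer; that pointer is also returned in
            // RAX/EAX per the code above
}
} // namespace sret_sketch
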
02138 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
02139   if (N->getNumValues() != 1)
02140     return false;
02141   if (!N->hasNUsesOfValue(1, 0))
02142     return false;
02143 
02144   SDValue TCChain = Chain;
02145   SDNode *Copy = *N->use_begin();
02146   if (Copy->getOpcode() == ISD::CopyToReg) {
02147     // If the copy has a glue operand, we conservatively assume it isn't safe to
02148     // perform a tail call.
02149     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
02150       return false;
02151     TCChain = Copy->getOperand(0);
02152   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
02153     return false;
02154 
02155   bool HasRet = false;
02156   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
02157        UI != UE; ++UI) {
02158     if (UI->getOpcode() != X86ISD::RET_FLAG)
02159       return false;
02160     // If we are returning more than one value, we can definitely
02161     // not make a tail call; see PR19530.
02162     if (UI->getNumOperands() > 4)
02163       return false;
02164     if (UI->getNumOperands() == 4 &&
02165         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
02166       return false;
02167     HasRet = true;
02168   }
02169 
02170   if (!HasRet)
02171     return false;
02172 
02173   Chain = TCChain;
02174   return true;
02175 }
02176 
02177 EVT
02178 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
02179                                             ISD::NodeType ExtendKind) const {
02180   MVT ReturnMVT;
02181   // TODO: Is this also valid on 32-bit?
02182   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
02183     ReturnMVT = MVT::i8;
02184   else
02185     ReturnMVT = MVT::i32;
02186 
02187   EVT MinVT = getRegisterType(Context, ReturnMVT);
02188   return VT.bitsLT(MinVT) ? MinVT : VT;
02189 }
02190 
02191 /// Lower the result values of a call into the
02192 /// appropriate copies out of the physical registers.
02193 ///
02194 SDValue
02195 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
02196                                    CallingConv::ID CallConv, bool isVarArg,
02197                                    const SmallVectorImpl<ISD::InputArg> &Ins,
02198                                    SDLoc dl, SelectionDAG &DAG,
02199                                    SmallVectorImpl<SDValue> &InVals) const {
02200 
02201   // Assign locations to each value returned by this call.
02202   SmallVector<CCValAssign, 16> RVLocs;
02203   bool Is64Bit = Subtarget->is64Bit();
02204   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
02205                  *DAG.getContext());
02206   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
02207 
02208   // Copy all of the result registers out of their specified physreg.
02209   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
02210     CCValAssign &VA = RVLocs[i];
02211     EVT CopyVT = VA.getValVT();
02212 
02213     // If this is x86-64, and we disabled SSE, we can't return FP values
02214     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
02215         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
02216       report_fatal_error("SSE register return with SSE disabled");
02217     }
02218 
02219     // If we prefer to use the value in xmm registers, copy it out as f80 and
02220     // use a truncate to move it from fp stack reg to xmm reg.
02221     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
02222         isScalarFPTypeInSSEReg(VA.getValVT()))
02223       CopyVT = MVT::f80;
02224 
02225     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
02226                                CopyVT, InFlag).getValue(1);
02227     SDValue Val = Chain.getValue(0);
02228 
02229     if (CopyVT != VA.getValVT())
02230       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
02231                         // This truncation won't change the value.
02232                         DAG.getIntPtrConstant(1));
02233 
02234     InFlag = Chain.getValue(2);
02235     InVals.push_back(Val);
02236   }
02237 
02238   return Chain;
02239 }
02240 
02241 //===----------------------------------------------------------------------===//
02242 //                C & StdCall & Fast Calling Convention implementation
02243 //===----------------------------------------------------------------------===//
02244 //  The StdCall calling convention is the standard for many Windows API
02245 //  routines. It differs from the C calling convention only slightly: the
02246 //  callee cleans up the stack rather than the caller, and symbols are
02247 //  decorated in a special way. It does not support vector arguments.
02248 //  For info on fast calling convention see Fast Calling Convention (tail call)
02249 //  implementation LowerX86_32FastCCCallTo.
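//  For example (illustrative; the exact decoration is toolchain dependent), a
//  Win32 declaration such as
//    int __stdcall Blend(int a, int b);   // exported as _Blend@8
//  is compiled so that Blend itself pops the 8 argument bytes on return.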
02250 
02251 /// callIsStructReturn - Determines whether a call uses struct return
02252 /// semantics.
02253 enum StructReturnType {
02254   NotStructReturn,
02255   RegStructReturn,
02256   StackStructReturn
02257 };
02258 static StructReturnType
02259 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
02260   if (Outs.empty())
02261     return NotStructReturn;
02262 
02263   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
02264   if (!Flags.isSRet())
02265     return NotStructReturn;
02266   if (Flags.isInReg())
02267     return RegStructReturn;
02268   return StackStructReturn;
02269 }
02270 
02271 /// Determines whether a function uses struct return semantics.
02272 static StructReturnType
02273 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
02274   if (Ins.empty())
02275     return NotStructReturn;
02276 
02277   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
02278   if (!Flags.isSRet())
02279     return NotStructReturn;
02280   if (Flags.isInReg())
02281     return RegStructReturn;
02282   return StackStructReturn;
02283 }
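
// For example (illustrative, not from the original source): a by-value
// aggregate return such as
//   struct Big { int v[8]; };
//   struct Big make(void);
// is typically lowered with a hidden sret pointer argument; the classifiers
// above report StackStructReturn for it, or RegStructReturn when the sret
// pointer is marked inreg.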
02284 
02285 /// Make a copy of an aggregate at address specified by "Src" to address
02286 /// "Dst" with size and alignment information specified by the byval
02287 /// parameter attribute. The copy will be passed as a byval function parameter.
02288 static SDValue
02289 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
02290                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
02291                           SDLoc dl) {
02292   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
02293 
02294   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
02295                        /*isVolatile*/false, /*AlwaysInline=*/true,
02296                        MachinePointerInfo(), MachinePointerInfo());
02297 }
02298 
02299 /// Return true if the calling convention is one that
02300 /// supports tail call optimization.
02301 static bool IsTailCallConvention(CallingConv::ID CC) {
02302   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
02303           CC == CallingConv::HiPE);
02304 }
02305 
02306 /// \brief Return true if the calling convention is a C calling convention.
02307 static bool IsCCallConvention(CallingConv::ID CC) {
02308   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
02309           CC == CallingConv::X86_64_SysV);
02310 }
02311 
02312 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
02313   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
02314     return false;
02315 
02316   CallSite CS(CI);
02317   CallingConv::ID CalleeCC = CS.getCallingConv();
02318   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
02319     return false;
02320 
02321   return true;
02322 }
02323 
02324 /// Return true if the function is being made into
02325 /// a tailcall target by changing its ABI.
02326 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
02327                                    bool GuaranteedTailCallOpt) {
02328   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
02329 }
02330 
02331 SDValue
02332 X86TargetLowering::LowerMemArgument(SDValue Chain,
02333                                     CallingConv::ID CallConv,
02334                                     const SmallVectorImpl<ISD::InputArg> &Ins,
02335                                     SDLoc dl, SelectionDAG &DAG,
02336                                     const CCValAssign &VA,
02337                                     MachineFrameInfo *MFI,
02338                                     unsigned i) const {
02339   // Create the nodes corresponding to a load from this parameter slot.
02340   ISD::ArgFlagsTy Flags = Ins[i].Flags;
02341   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
02342       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
02343   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
02344   EVT ValVT;
02345 
02346   // If the value is passed by pointer, we receive the address of the value
02347   // rather than the value itself.
02348   if (VA.getLocInfo() == CCValAssign::Indirect)
02349     ValVT = VA.getLocVT();
02350   else
02351     ValVT = VA.getValVT();
02352 
02353   // FIXME: For now, all byval parameter objects are marked mutable. This can be
02354   // changed with more analysis.
02355   // In the case of tail call optimization, mark all arguments mutable, since
02356   // they could be overwritten when the arguments of a tail call are lowered.
02357   if (Flags.isByVal()) {
02358     unsigned Bytes = Flags.getByValSize();
02359     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
02360     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
02361     return DAG.getFrameIndex(FI, getPointerTy());
02362   } else {
02363     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
02364                                     VA.getLocMemOffset(), isImmutable);
02365     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
02366     return DAG.getLoad(ValVT, dl, Chain, FIN,
02367                        MachinePointerInfo::getFixedStack(FI),
02368                        false, false, false, 0);
02369   }
02370 }
02371 
02372 // FIXME: Get this from tablegen.
02373 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
02374                                                 const X86Subtarget *Subtarget) {
02375   assert(Subtarget->is64Bit());
02376 
02377   if (Subtarget->isCallingConvWin64(CallConv)) {
02378     static const MCPhysReg GPR64ArgRegsWin64[] = {
02379       X86::RCX, X86::RDX, X86::R8,  X86::R9
02380     };
02381     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
02382   }
02383 
02384   static const MCPhysReg GPR64ArgRegs64Bit[] = {
02385     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
02386   };
02387   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
02388 }
02389 
02390 // FIXME: Get this from tablegen.
02391 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
02392                                                 CallingConv::ID CallConv,
02393                                                 const X86Subtarget *Subtarget) {
02394   assert(Subtarget->is64Bit());
02395   if (Subtarget->isCallingConvWin64(CallConv)) {
02396     // The XMM registers which might contain var arg parameters are shadowed
02397     // in their paired GPRs.  So we only need to save the GPRs to their home
02398     // slots.
02399     // TODO: __vectorcall will change this.
02400     return None;
02401   }
02402 
02403   const Function *Fn = MF.getFunction();
02404   bool NoImplicitFloatOps = Fn->getAttributes().
02405       hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
02406   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
02407          "SSE register cannot be used when SSE is disabled!");
02408   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
02409       !Subtarget->hasSSE1())
02410     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
02411     // registers.
02412     return None;
02413 
02414   static const MCPhysReg XMMArgRegs64Bit[] = {
02415     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02416     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02417   };
02418   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
02419 }
02420 
02421 SDValue
02422 X86TargetLowering::LowerFormalArguments(SDValue Chain,
02423                                         CallingConv::ID CallConv,
02424                                         bool isVarArg,
02425                                       const SmallVectorImpl<ISD::InputArg> &Ins,
02426                                         SDLoc dl,
02427                                         SelectionDAG &DAG,
02428                                         SmallVectorImpl<SDValue> &InVals)
02429                                           const {
02430   MachineFunction &MF = DAG.getMachineFunction();
02431   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02432 
02433   const Function* Fn = MF.getFunction();
02434   if (Fn->hasExternalLinkage() &&
02435       Subtarget->isTargetCygMing() &&
02436       Fn->getName() == "main")
02437     FuncInfo->setForceFramePointer(true);
02438 
02439   MachineFrameInfo *MFI = MF.getFrameInfo();
02440   bool Is64Bit = Subtarget->is64Bit();
02441   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
02442 
02443   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02444          "Var args not supported with calling convention fastcc, ghc or hipe");
02445 
02446   // Assign locations to all of the incoming arguments.
02447   SmallVector<CCValAssign, 16> ArgLocs;
02448   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02449 
02450   // Allocate shadow area for Win64
02451   if (IsWin64)
02452     CCInfo.AllocateStack(32, 8);
02453 
02454   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
02455 
02456   unsigned LastVal = ~0U;
02457   SDValue ArgValue;
02458   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02459     CCValAssign &VA = ArgLocs[i];
02460     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
02461     // places.
02462     assert(VA.getValNo() != LastVal &&
02463            "Don't support value assigned to multiple locs yet");
02464     (void)LastVal;
02465     LastVal = VA.getValNo();
02466 
02467     if (VA.isRegLoc()) {
02468       EVT RegVT = VA.getLocVT();
02469       const TargetRegisterClass *RC;
02470       if (RegVT == MVT::i32)
02471         RC = &X86::GR32RegClass;
02472       else if (Is64Bit && RegVT == MVT::i64)
02473         RC = &X86::GR64RegClass;
02474       else if (RegVT == MVT::f32)
02475         RC = &X86::FR32RegClass;
02476       else if (RegVT == MVT::f64)
02477         RC = &X86::FR64RegClass;
02478       else if (RegVT.is512BitVector())
02479         RC = &X86::VR512RegClass;
02480       else if (RegVT.is256BitVector())
02481         RC = &X86::VR256RegClass;
02482       else if (RegVT.is128BitVector())
02483         RC = &X86::VR128RegClass;
02484       else if (RegVT == MVT::x86mmx)
02485         RC = &X86::VR64RegClass;
02486       else if (RegVT == MVT::i1)
02487         RC = &X86::VK1RegClass;
02488       else if (RegVT == MVT::v8i1)
02489         RC = &X86::VK8RegClass;
02490       else if (RegVT == MVT::v16i1)
02491         RC = &X86::VK16RegClass;
02492       else if (RegVT == MVT::v32i1)
02493         RC = &X86::VK32RegClass;
02494       else if (RegVT == MVT::v64i1)
02495         RC = &X86::VK64RegClass;
02496       else
02497         llvm_unreachable("Unknown argument type!");
02498 
02499       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
02500       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
02501 
02502       // If this is an 8 or 16-bit value, it is really passed promoted to 32
02503       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
02504       // right size.
02505       if (VA.getLocInfo() == CCValAssign::SExt)
02506         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
02507                                DAG.getValueType(VA.getValVT()));
02508       else if (VA.getLocInfo() == CCValAssign::ZExt)
02509         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
02510                                DAG.getValueType(VA.getValVT()));
02511       else if (VA.getLocInfo() == CCValAssign::BCvt)
02512         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
02513 
02514       if (VA.isExtInLoc()) {
02515         // Handle MMX values passed in XMM regs.
02516         if (RegVT.isVector())
02517           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
02518         else
02519           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
02520       }
02521     } else {
02522       assert(VA.isMemLoc());
02523       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
02524     }
02525 
02526     // If value is passed via pointer - do a load.
02527     if (VA.getLocInfo() == CCValAssign::Indirect)
02528       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
02529                              MachinePointerInfo(), false, false, false, 0);
02530 
02531     InVals.push_back(ArgValue);
02532   }
02533 
02534   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
02535     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02536       // The x86-64 ABIs require that for returning structs by value we copy
02537       // the sret argument into %rax/%eax (depending on ABI) for the return.
02538       // Win32 requires us to put the sret argument in %eax as well.
02539       // Save the argument into a virtual register so that we can access it
02540       // from the return points.
02541       if (Ins[i].Flags.isSRet()) {
02542         unsigned Reg = FuncInfo->getSRetReturnReg();
02543         if (!Reg) {
02544           MVT PtrTy = getPointerTy();
02545           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
02546           FuncInfo->setSRetReturnReg(Reg);
02547         }
02548         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
02549         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
02550         break;
02551       }
02552     }
02553   }
02554 
02555   unsigned StackSize = CCInfo.getNextStackOffset();
02556   // Align stack specially for tail calls.
02557   if (FuncIsMadeTailCallSafe(CallConv,
02558                              MF.getTarget().Options.GuaranteedTailCallOpt))
02559     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
02560 
02561   // If the function takes variable number of arguments, make a frame index for
02562   // the start of the first vararg value... for expansion of llvm.va_start. We
02563   // can skip this if there are no va_start calls.
02564   if (MFI->hasVAStart() &&
02565       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
02566                    CallConv != CallingConv::X86_ThisCall))) {
02567     FuncInfo->setVarArgsFrameIndex(
02568         MFI->CreateFixedObject(1, StackSize, true));
02569   }
02570 
02571   // Figure out if XMM registers are in use.
02572   assert(!(MF.getTarget().Options.UseSoftFloat &&
02573            Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
02574                                             Attribute::NoImplicitFloat)) &&
02575          "SSE register cannot be used when SSE is disabled!");
02576 
02577   // 64-bit calling conventions support varargs and register parameters, so we
02578   // have to do extra work to spill them in the prologue.
02579   if (Is64Bit && isVarArg && MFI->hasVAStart()) {
02580     // Find the first unallocated argument registers.
02581     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
02582     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
02583     unsigned NumIntRegs =
02584         CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
02585     unsigned NumXMMRegs =
02586         CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
02587     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
02588            "SSE register cannot be used when SSE is disabled!");
02589 
02590     // Gather all the live in physical registers.
02591     SmallVector<SDValue, 6> LiveGPRs;
02592     SmallVector<SDValue, 8> LiveXMMRegs;
02593     SDValue ALVal;
02594     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
02595       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
02596       LiveGPRs.push_back(
02597           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
02598     }
02599     if (!ArgXMMs.empty()) {
02600       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02601       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
02602       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
02603         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
02604         LiveXMMRegs.push_back(
02605             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
02606       }
02607     }
02608 
02609     if (IsWin64) {
02610       const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
02611       // Get to the caller-allocated home save location.  Add 8 to account
02612       // for the return address.
02613       int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
02614       FuncInfo->setRegSaveFrameIndex(
02615           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
02616       // Fix up the vararg frame index to point into the shadow area (4 x i64).
02617       if (NumIntRegs < 4)
02618         FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
02619     } else {
02620       // For X86-64, if there are vararg parameters that are passed via
02621       // registers, then we must store them to their spots on the stack so
02622       // they may be loaded by dereferencing the result of va_next.
02623       FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
02624       FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
02625       FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
02626           ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
02627     }
02628 
02629     // Store the integer parameter registers.
02630     SmallVector<SDValue, 8> MemOps;
02631     SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
02632                                       getPointerTy());
02633     unsigned Offset = FuncInfo->getVarArgsGPOffset();
02634     for (SDValue Val : LiveGPRs) {
02635       SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
02636                                 DAG.getIntPtrConstant(Offset));
02637       SDValue Store =
02638         DAG.getStore(Val.getValue(1), dl, Val, FIN,
02639                      MachinePointerInfo::getFixedStack(
02640                        FuncInfo->getRegSaveFrameIndex(), Offset),
02641                      false, false, 0);
02642       MemOps.push_back(Store);
02643       Offset += 8;
02644     }
02645 
02646     if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
02647       // Now store the XMM (fp + vector) parameter registers.
02648       SmallVector<SDValue, 12> SaveXMMOps;
02649       SaveXMMOps.push_back(Chain);
02650       SaveXMMOps.push_back(ALVal);
02651       SaveXMMOps.push_back(DAG.getIntPtrConstant(
02652                              FuncInfo->getRegSaveFrameIndex()));
02653       SaveXMMOps.push_back(DAG.getIntPtrConstant(
02654                              FuncInfo->getVarArgsFPOffset()));
02655       SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
02656                         LiveXMMRegs.end());
02657       MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
02658                                    MVT::Other, SaveXMMOps));
02659     }
02660 
02661     if (!MemOps.empty())
02662       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
02663   }
02664 
02665   if (isVarArg && MFI->hasMustTailInVarArgFunc()) {
02666     // Find the largest legal vector type.
02667     MVT VecVT = MVT::Other;
02668     // FIXME: Only some x86_32 calling conventions support AVX512.
02669     if (Subtarget->hasAVX512() &&
02670         (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
02671                      CallConv == CallingConv::Intel_OCL_BI)))
02672       VecVT = MVT::v16f32;
02673     else if (Subtarget->hasAVX())
02674       VecVT = MVT::v8f32;
02675     else if (Subtarget->hasSSE2())
02676       VecVT = MVT::v4f32;
02677 
02678     // We forward some GPRs and some vector types.
02679     SmallVector<MVT, 2> RegParmTypes;
02680     MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
02681     RegParmTypes.push_back(IntVT);
02682     if (VecVT != MVT::Other)
02683       RegParmTypes.push_back(VecVT);
02684 
02685     // Compute the set of forwarded registers. The rest are scratch.
02686     SmallVectorImpl<ForwardedRegister> &Forwards =
02687         FuncInfo->getForwardedMustTailRegParms();
02688     CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
02689 
02690     // Conservatively forward AL on x86_64, since it might be used for varargs.
02691     if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
02692       unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02693       Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
02694     }
02695 
02696     // Copy all forwards from physical to virtual registers.
02697     for (ForwardedRegister &F : Forwards) {
02698       // FIXME: Can we use a less constrained schedule?
02699       SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
02700       F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
02701       Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
02702     }
02703   }
02704 
02705   // Some CCs need callee pop.
02706   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02707                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
02708     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
02709   } else {
02710     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
02711     // If this is an sret function, the return should pop the hidden pointer.
02712     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
02713         !Subtarget->getTargetTriple().isOSMSVCRT() &&
02714         argsAreStructReturn(Ins) == StackStructReturn)
02715       FuncInfo->setBytesToPopOnReturn(4);
02716   }
02717 
02718   if (!Is64Bit) {
02719     // RegSaveFrameIndex is X86-64 only.
02720     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
02721     if (CallConv == CallingConv::X86_FastCall ||
02722         CallConv == CallingConv::X86_ThisCall)
02723       // fastcc functions can't have varargs.
02724       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
02725   }
02726 
02727   FuncInfo->setArgumentStackSize(StackSize);
02728 
02729   return Chain;
02730 }
02731 
02732 SDValue
02733 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
02734                                     SDValue StackPtr, SDValue Arg,
02735                                     SDLoc dl, SelectionDAG &DAG,
02736                                     const CCValAssign &VA,
02737                                     ISD::ArgFlagsTy Flags) const {
02738   unsigned LocMemOffset = VA.getLocMemOffset();
02739   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
02740   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
02741   if (Flags.isByVal())
02742     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
02743 
02744   return DAG.getStore(Chain, dl, Arg, PtrOff,
02745                       MachinePointerInfo::getStack(LocMemOffset),
02746                       false, false, 0);
02747 }
02748 
02749 /// Emit a load of the return address if tail call
02750 /// optimization is performed and it is required.
02751 SDValue
02752 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
02753                                            SDValue &OutRetAddr, SDValue Chain,
02754                                            bool IsTailCall, bool Is64Bit,
02755                                            int FPDiff, SDLoc dl) const {
02756   // Adjust the Return address stack slot.
02757   EVT VT = getPointerTy();
02758   OutRetAddr = getReturnAddressFrameIndex(DAG);
02759 
02760   // Load the "old" Return address.
02761   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
02762                            false, false, false, 0);
02763   return SDValue(OutRetAddr.getNode(), 1);
02764 }
02765 
02766 /// Emit a store of the return address if tail call
02767 /// optimization is performed and it is required (FPDiff!=0).
02768 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
02769                                         SDValue Chain, SDValue RetAddrFrIdx,
02770                                         EVT PtrVT, unsigned SlotSize,
02771                                         int FPDiff, SDLoc dl) {
02772   // Store the return address to the appropriate stack slot.
02773   if (!FPDiff) return Chain;
02774   // Calculate the new stack slot for the return address.
02775   int NewReturnAddrFI =
02776     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
02777                                          false);
02778   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
02779   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
02780                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
02781                        false, false, 0);
02782   return Chain;
02783 }
02784 
02785 SDValue
02786 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
02787                              SmallVectorImpl<SDValue> &InVals) const {
02788   SelectionDAG &DAG                     = CLI.DAG;
02789   SDLoc &dl                             = CLI.DL;
02790   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
02791   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
02792   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
02793   SDValue Chain                         = CLI.Chain;
02794   SDValue Callee                        = CLI.Callee;
02795   CallingConv::ID CallConv              = CLI.CallConv;
02796   bool &isTailCall                      = CLI.IsTailCall;
02797   bool isVarArg                         = CLI.IsVarArg;
02798 
02799   MachineFunction &MF = DAG.getMachineFunction();
02800   bool Is64Bit        = Subtarget->is64Bit();
02801   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
02802   StructReturnType SR = callIsStructReturn(Outs);
02803   bool IsSibcall      = false;
02804   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
02805 
02806   if (MF.getTarget().Options.DisableTailCalls)
02807     isTailCall = false;
02808 
02809   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
02810   if (IsMustTail) {
02811     // Force this to be a tail call.  The verifier rules are enough to ensure
02812     // that we can lower this successfully without moving the return address
02813     // around.
02814     isTailCall = true;
02815   } else if (isTailCall) {
02816     // Check if it's really possible to do a tail call.
02817     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
02818                     isVarArg, SR != NotStructReturn,
02819                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
02820                     Outs, OutVals, Ins, DAG);
02821 
02822     // Sibcalls are automatically detected tailcalls which do not require
02823     // ABI changes.
02824     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
02825       IsSibcall = true;
02826 
02827     if (isTailCall)
02828       ++NumTailCalls;
02829   }
02830 
02831   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02832          "Var args not supported with calling convention fastcc, ghc or hipe");
02833 
02834   // Analyze operands of the call, assigning locations to each operand.
02835   SmallVector<CCValAssign, 16> ArgLocs;
02836   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02837 
02838   // Allocate shadow area for Win64
02839   if (IsWin64)
02840     CCInfo.AllocateStack(32, 8);
02841 
02842   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
02843 
02844   // Get a count of how many bytes are to be pushed on the stack.
02845   unsigned NumBytes = CCInfo.getNextStackOffset();
02846   if (IsSibcall)
02847     // This is a sibcall. The memory operands are already available in the
02848     // caller's incoming argument stack.
02849     NumBytes = 0;
02850   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
02851            IsTailCallConvention(CallConv))
02852     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
02853 
02854   int FPDiff = 0;
02855   if (isTailCall && !IsSibcall && !IsMustTail) {
02856     // Lower arguments at fp - stackoffset + fpdiff.
02857     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
02858 
02859     FPDiff = NumBytesCallerPushed - NumBytes;
02860 
02861     // Set the delta of the movement of the return address stack slot,
02862     // but only if the new delta is less than the previously recorded delta.
02863     if (FPDiff < X86Info->getTCReturnAddrDelta())
02864       X86Info->setTCReturnAddrDelta(FPDiff);
02865   }
02866 
02867   unsigned NumBytesToPush = NumBytes;
02868   unsigned NumBytesToPop = NumBytes;
02869 
02870   // If we have an inalloca argument, all stack space has already been allocated
02871   // for us and is right at the top of the stack.  We don't support multiple
02872   // arguments passed in memory when using inalloca.
02873   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
02874     NumBytesToPush = 0;
02875     if (!ArgLocs.back().isMemLoc())
02876       report_fatal_error("cannot use inalloca attribute on a register "
02877                          "parameter");
02878     if (ArgLocs.back().getLocMemOffset() != 0)
02879       report_fatal_error("any parameter with the inalloca attribute must be "
02880                          "the only memory argument");
02881   }
02882 
02883   if (!IsSibcall)
02884     Chain = DAG.getCALLSEQ_START(
02885         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
02886 
02887   SDValue RetAddrFrIdx;
02888   // Load return address for tail calls.
02889   if (isTailCall && FPDiff)
02890     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
02891                                     Is64Bit, FPDiff, dl);
02892 
02893   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
02894   SmallVector<SDValue, 8> MemOpChains;
02895   SDValue StackPtr;
02896 
02897   // Walk the register/memloc assignments, inserting copies/loads.  In the case
02898   // of tail call optimization, arguments are handled later.
02899   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
02900       DAG.getSubtarget().getRegisterInfo());
02901   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02902     // Skip inalloca arguments, they have already been written.
02903     ISD::ArgFlagsTy Flags = Outs[i].Flags;
02904     if (Flags.isInAlloca())
02905       continue;
02906 
02907     CCValAssign &VA = ArgLocs[i];
02908     EVT RegVT = VA.getLocVT();
02909     SDValue Arg = OutVals[i];
02910     bool isByVal = Flags.isByVal();
02911 
02912     // Promote the value if needed.
02913     switch (VA.getLocInfo()) {
02914     default: llvm_unreachable("Unknown loc info!");
02915     case CCValAssign::Full: break;
02916     case CCValAssign::SExt:
02917       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
02918       break;
02919     case CCValAssign::ZExt:
02920       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
02921       break;
02922     case CCValAssign::AExt:
02923       if (RegVT.is128BitVector()) {
02924         // Special case: passing MMX values in XMM registers.
02925         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
02926         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
02927         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
02928       } else
02929         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
02930       break;
02931     case CCValAssign::BCvt:
02932       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
02933       break;
02934     case CCValAssign::Indirect: {
02935       // Store the argument.
02936       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
02937       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
02938       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
02939                            MachinePointerInfo::getFixedStack(FI),
02940                            false, false, 0);
02941       Arg = SpillSlot;
02942       break;
02943     }
02944     }
02945 
02946     if (VA.isRegLoc()) {
02947       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
02948       if (isVarArg && IsWin64) {
02949         // The Win64 ABI requires an argument XMM register to be copied to the
02950         // corresponding shadow register if the callee is a varargs function.
02951         unsigned ShadowReg = 0;
02952         switch (VA.getLocReg()) {
02953         case X86::XMM0: ShadowReg = X86::RCX; break;
02954         case X86::XMM1: ShadowReg = X86::RDX; break;
02955         case X86::XMM2: ShadowReg = X86::R8; break;
02956         case X86::XMM3: ShadowReg = X86::R9; break;
02957         }
02958         if (ShadowReg)
02959           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
02960       }
02961     } else if (!IsSibcall && (!isTailCall || isByVal)) {
02962       assert(VA.isMemLoc());
02963       if (!StackPtr.getNode())
02964         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
02965                                       getPointerTy());
02966       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
02967                                              dl, DAG, VA, Flags));
02968     }
02969   }
02970 
02971   if (!MemOpChains.empty())
02972     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
02973 
02974   if (Subtarget->isPICStyleGOT()) {
02975     // ELF / PIC requires the GOT pointer to be in the EBX register before
02976     // function calls made via the PLT.
02977     if (!isTailCall) {
02978       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
02979                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
02980     } else {
02981       // If we are tail calling and generating PIC/GOT style code load the
02982       // address of the callee into ECX. The value in ecx is used as target of
02983       // the tail jump. This is done to circumvent the ebx/callee-saved problem
02984       // for tail calls on PIC/GOT architectures. Normally we would just put the
02985       // address of GOT into ebx and then call target@PLT. But for tail calls
02986       // ebx would be restored (since ebx is callee saved) before jumping to the
02987       // target@PLT.
02988 
02989       // Note: The actual moving to ECX is done further down.
02990       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
02991       if (G && !G->getGlobal()->hasHiddenVisibility() &&
02992           !G->getGlobal()->hasProtectedVisibility())
02993         Callee = LowerGlobalAddress(Callee, DAG);
02994       else if (isa<ExternalSymbolSDNode>(Callee))
02995         Callee = LowerExternalSymbol(Callee, DAG);
02996     }
02997   }
02998 
02999   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
03000     // From AMD64 ABI document:
03001     // For calls that may call functions that use varargs or stdargs
03002     // (prototype-less calls or calls to functions containing ellipsis (...) in
03003     // the declaration) %al is used as hidden argument to specify the number
03004     // of SSE registers used. The contents of %al do not need to match exactly
03005     // the number of registers, but must be an upper bound on the number of SSE
03006     // registers used and is in the range 0 - 8 inclusive.
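    // For example (illustrative): a call such as printf("%f %f\n", x, y) with
    // two doubles passed in XMM0/XMM1 would have %al set to 2 here (any upper
    // bound up to 8 would also satisfy the ABI).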
03007 
03008     // Count the number of XMM registers allocated.
03009     static const MCPhysReg XMMArgRegs[] = {
03010       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
03011       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
03012     };
03013     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
03014     assert((Subtarget->hasSSE1() || !NumXMMRegs)
03015            && "SSE registers cannot be used when SSE is disabled");
03016 
03017     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
03018                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
03019   }
03020 
03021   if (isVarArg && IsMustTail) {
03022     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
03023     for (const auto &F : Forwards) {
03024       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
03025       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
03026     }
03027   }
03028 
03029   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
03030   // don't need this because the eligibility check rejects calls that require
03031   // shuffling arguments passed in memory.
03032   if (!IsSibcall && isTailCall) {
03033     // Force all the incoming stack arguments to be loaded from the stack
03034     // before any new outgoing arguments are stored to the stack, because the
03035     // outgoing stack slots may alias the incoming argument stack slots, and
03036     // the alias isn't otherwise explicit. This is slightly more conservative
03037     // than necessary, because it means that each store effectively depends
03038     // on every argument instead of just those arguments it would clobber.
03039     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
03040 
03041     SmallVector<SDValue, 8> MemOpChains2;
03042     SDValue FIN;
03043     int FI = 0;
03044     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03045       CCValAssign &VA = ArgLocs[i];
03046       if (VA.isRegLoc())
03047         continue;
03048       assert(VA.isMemLoc());
03049       SDValue Arg = OutVals[i];
03050       ISD::ArgFlagsTy Flags = Outs[i].Flags;
03051       // Skip inalloca arguments.  They don't require any work.
03052       if (Flags.isInAlloca())
03053         continue;
03054       // Create frame index.
03055       int32_t Offset = VA.getLocMemOffset()+FPDiff;
03056       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
03057       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
03058       FIN = DAG.getFrameIndex(FI, getPointerTy());
03059 
03060       if (Flags.isByVal()) {
03061         // Copy relative to framepointer.
03062         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
03063         if (!StackPtr.getNode())
03064           StackPtr = DAG.getCopyFromReg(Chain, dl,
03065                                         RegInfo->getStackRegister(),
03066                                         getPointerTy());
03067         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
03068 
03069         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
03070                                                          ArgChain,
03071                                                          Flags, DAG, dl));
03072       } else {
03073         // Store relative to framepointer.
03074         MemOpChains2.push_back(
03075           DAG.getStore(ArgChain, dl, Arg, FIN,
03076                        MachinePointerInfo::getFixedStack(FI),
03077                        false, false, 0));
03078       }
03079     }
03080 
03081     if (!MemOpChains2.empty())
03082       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
03083 
03084     // Store the return address to the appropriate stack slot.
03085     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
03086                                      getPointerTy(), RegInfo->getSlotSize(),
03087                                      FPDiff, dl);
03088   }
03089 
03090   // Build a sequence of copy-to-reg nodes chained together with token chain
03091   // and flag operands which copy the outgoing args into registers.
03092   SDValue InFlag;
03093   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
03094     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
03095                              RegsToPass[i].second, InFlag);
03096     InFlag = Chain.getValue(1);
03097   }
03098 
03099   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
03100     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
03101     // In the 64-bit large code model, we have to make all calls
03102     // through a register, since the call instruction's 32-bit
03103     // pc-relative offset may not be large enough to hold the whole
03104     // address.
03105   } else if (Callee->getOpcode() == ISD::GlobalAddress) {
03106     // If the callee is a GlobalAddress node (quite common, every direct call
03107     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
03108     // it.
03109     GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
03110 
03111     // We should use an extra load for direct calls to dllimported
03112     // functions in non-JIT mode.
03113     const GlobalValue *GV = G->getGlobal();
03114     if (!GV->hasDLLImportStorageClass()) {
03115       unsigned char OpFlags = 0;
03116       bool ExtraLoad = false;
03117       unsigned WrapperKind = ISD::DELETED_NODE;
03118 
03119       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
03120       // external symbols must go through the PLT in PIC mode.  If the symbol
03121       // has hidden or protected visibility, or if it is static or local, then
03122       // we don't need to use the PLT - we can directly call it.
03123       if (Subtarget->isTargetELF() &&
03124           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
03125           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
03126         OpFlags = X86II::MO_PLT;
03127       } else if (Subtarget->isPICStyleStubAny() &&
03128                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
03129                  (!Subtarget->getTargetTriple().isMacOSX() ||
03130                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03131         // PC-relative references to external symbols should go through $stub,
03132         // unless we're building with the leopard linker or later, which
03133         // automatically synthesizes these stubs.
03134         OpFlags = X86II::MO_DARWIN_STUB;
03135       } else if (Subtarget->isPICStyleRIPRel() &&
03136                  isa<Function>(GV) &&
03137                  cast<Function>(GV)->getAttributes().
03138                    hasAttribute(AttributeSet::FunctionIndex,
03139                                 Attribute::NonLazyBind)) {
03140         // If the function is marked as non-lazy, generate an indirect call
03141         // which loads from the GOT directly. This avoids runtime overhead
03142         // at the cost of eager binding (and one extra byte of encoding).
03143         OpFlags = X86II::MO_GOTPCREL;
03144         WrapperKind = X86ISD::WrapperRIP;
03145         ExtraLoad = true;
03146       }
03147 
03148       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
03149                                           G->getOffset(), OpFlags);
03150 
03151       // Add a wrapper if needed.
03152       if (WrapperKind != ISD::DELETED_NODE)
03153         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
03154       // Add extra indirection if needed.
03155       if (ExtraLoad)
03156         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
03157                              MachinePointerInfo::getGOT(),
03158                              false, false, false, 0);
03159     }
03160   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
03161     unsigned char OpFlags = 0;
03162 
03163     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
03164     // external symbols should go through the PLT.
03165     if (Subtarget->isTargetELF() &&
03166         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
03167       OpFlags = X86II::MO_PLT;
03168     } else if (Subtarget->isPICStyleStubAny() &&
03169                (!Subtarget->getTargetTriple().isMacOSX() ||
03170                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03171       // PC-relative references to external symbols should go through $stub,
03172       // unless we're building with the leopard linker or later, which
03173       // automatically synthesizes these stubs.
03174       OpFlags = X86II::MO_DARWIN_STUB;
03175     }
03176 
03177     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
03178                                          OpFlags);
03179   } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
03180     // Zero-extend the 32-bit Callee address to 64 bits as required by the x32 ABI.
03181     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
03182   }
03183 
03184   // Returns a chain & a flag for retval copy to use.
03185   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
03186   SmallVector<SDValue, 8> Ops;
03187 
03188   if (!IsSibcall && isTailCall) {
03189     Chain = DAG.getCALLSEQ_END(Chain,
03190                                DAG.getIntPtrConstant(NumBytesToPop, true),
03191                                DAG.getIntPtrConstant(0, true), InFlag, dl);
03192     InFlag = Chain.getValue(1);
03193   }
03194 
03195   Ops.push_back(Chain);
03196   Ops.push_back(Callee);
03197 
03198   if (isTailCall)
03199     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
03200 
03201   // Add argument registers to the end of the list so that they are known live
03202   // into the call.
03203   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
03204     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
03205                                   RegsToPass[i].second.getValueType()));
03206 
03207   // Add a register mask operand representing the call-preserved registers.
03208   const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
03209   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
03210   assert(Mask && "Missing call preserved mask for calling convention");
03211   Ops.push_back(DAG.getRegisterMask(Mask));
03212 
03213   if (InFlag.getNode())
03214     Ops.push_back(InFlag);
03215 
03216   if (isTailCall) {
03217     // We used to do:
03218     //// If this is the first return lowered for this function, add the regs
03219     //// to the liveout set for the function.
03220     // This isn't right, although it's probably harmless on x86; liveouts
03221     // should be computed from returns not tail calls.  Consider a void
03222     // function making a tail call to a function returning int.
03223     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
03224   }
03225 
03226   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
03227   InFlag = Chain.getValue(1);
03228 
03229   // Create the CALLSEQ_END node.
03230   unsigned NumBytesForCalleeToPop;
03231   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
03232                        DAG.getTarget().Options.GuaranteedTailCallOpt))
03233     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
03234   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
03235            !Subtarget->getTargetTriple().isOSMSVCRT() &&
03236            SR == StackStructReturn)
03237     // If this is a call to a struct-return function, the callee
03238     // pops the hidden struct pointer, so we have to push it back.
03239     // This is common for Darwin/X86, Linux & Mingw32 targets.
03240     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
03241     NumBytesForCalleeToPop = 4;
03242   else
03243     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
03244 
03245   // Returns a flag for retval copy to use.
03246   if (!IsSibcall) {
03247     Chain = DAG.getCALLSEQ_END(Chain,
03248                                DAG.getIntPtrConstant(NumBytesToPop, true),
03249                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
03250                                                      true),
03251                                InFlag, dl);
03252     InFlag = Chain.getValue(1);
03253   }
03254 
03255   // Handle result values, copying them out of physregs into vregs that we
03256   // return.
03257   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
03258                          Ins, dl, DAG, InVals);
03259 }
03260 
03261 //===----------------------------------------------------------------------===//
03262 //                Fast Calling Convention (tail call) implementation
03263 //===----------------------------------------------------------------------===//
03264 
03265 //  Like StdCall, the callee cleans up the arguments; unlike StdCall, ECX is
03266 //  reserved for storing the address of the tail-called function. Only 2
03267 //  registers are free for argument passing (inreg). Tail call optimization is
03268 //  performed provided:
03269 //                * tailcallopt is enabled
03270 //                * caller/callee are fastcc
03271 //  On X86_64 with GOT-style position independent code, only local (within
03272 //  module) calls are supported at the moment.
03273 //  To keep the stack aligned according to the platform ABI, the function
03274 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
03275 //  multiple of the stack alignment. (Dynamic linkers need this - Darwin's
03276 //  dyld, for example.) If a tail-called callee has more arguments than the
03277 //  caller, the caller needs to make sure that there is room to move the
03278 //  RETADDR to. This is achieved by reserving an area the size of the argument
03279 //  delta right after the original RETADDR, but before the saved framepointer
03280 //  or the spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
03281 //  stack layout:
03282 //    arg1
03283 //    arg2
03284 //    RETADDR
03285 //    [ new RETADDR
03286 //      move area ]
03287 //    (possible EBP)
03288 //    ESI
03289 //    EDI
03290 //    local1 ..
03291 
03292 /// GetAlignedArgumentStackSize - Align the stack size so that, e.g. for a
03293 /// 16-byte alignment requirement, the result has the form 16n + 12.
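/// For example (illustrative, assuming SlotSize == 4 and a 16-byte stack
/// alignment): an incoming StackSize of 20 becomes 20 + ((16 - 4) - (20 & 15))
/// == 28, which has the required 16n + 12 form.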
03294 unsigned
03295 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
03296                                                SelectionDAG& DAG) const {
03297   MachineFunction &MF = DAG.getMachineFunction();
03298   const TargetMachine &TM = MF.getTarget();
03299   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03300       TM.getSubtargetImpl()->getRegisterInfo());
03301   const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
03302   unsigned StackAlignment = TFI.getStackAlignment();
03303   uint64_t AlignMask = StackAlignment - 1;
03304   int64_t Offset = StackSize;
03305   unsigned SlotSize = RegInfo->getSlotSize();
03306   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
03307     // The misaligned part is at most (StackAlignment - SlotSize), so just add the difference.
03308     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
03309   } else {
03310     // Mask out the lower bits and add one full stack alignment plus (StackAlignment - SlotSize) bytes.
03311     Offset = ((~AlignMask) & Offset) + StackAlignment +
03312       (StackAlignment-SlotSize);
03313   }
03314   return Offset;
03315 }
03316 
03317 /// MatchingStackOffset - Return true if the given stack call argument is
03318 /// already available at the same (relative) position in the caller's
03319 /// incoming argument stack.
03320 static
03321 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
03322                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
03323                          const X86InstrInfo *TII) {
03324   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
03325   int FI = INT_MAX;
03326   if (Arg.getOpcode() == ISD::CopyFromReg) {
03327     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
03328     if (!TargetRegisterInfo::isVirtualRegister(VR))
03329       return false;
03330     MachineInstr *Def = MRI->getVRegDef(VR);
03331     if (!Def)
03332       return false;
03333     if (!Flags.isByVal()) {
03334       if (!TII->isLoadFromStackSlot(Def, FI))
03335         return false;
03336     } else {
03337       unsigned Opcode = Def->getOpcode();
03338       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
03339            Opcode == X86::LEA64_32r) &&
03340           Def->getOperand(1).isFI()) {
03341         FI = Def->getOperand(1).getIndex();
03342         Bytes = Flags.getByValSize();
03343       } else
03344         return false;
03345     }
03346   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
03347     if (Flags.isByVal())
03348       // ByVal argument is passed in as a pointer but it's now being
03349       // dereferenced. e.g.
03350       // define @foo(%struct.X* %A) {
03351       //   tail call @bar(%struct.X* byval %A)
03352       // }
03353       return false;
03354     SDValue Ptr = Ld->getBasePtr();
03355     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
03356     if (!FINode)
03357       return false;
03358     FI = FINode->getIndex();
03359   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
03360     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
03361     FI = FINode->getIndex();
03362     Bytes = Flags.getByValSize();
03363   } else
03364     return false;
03365 
03366   assert(FI != INT_MAX);
03367   if (!MFI->isFixedObjectIndex(FI))
03368     return false;
03369   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
03370 }
03371 
03372 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
03373 /// for tail call optimization. Targets which want to do tail call
03374 /// optimization should implement this function.
03375 bool
03376 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
03377                                                      CallingConv::ID CalleeCC,
03378                                                      bool isVarArg,
03379                                                      bool isCalleeStructRet,
03380                                                      bool isCallerStructRet,
03381                                                      Type *RetTy,
03382                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
03383                                     const SmallVectorImpl<SDValue> &OutVals,
03384                                     const SmallVectorImpl<ISD::InputArg> &Ins,
03385                                                      SelectionDAG &DAG) const {
03386   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
03387     return false;
03388 
03389   // If -tailcallopt is specified, make fastcc functions tail-callable.
03390   const MachineFunction &MF = DAG.getMachineFunction();
03391   const Function *CallerF = MF.getFunction();
03392 
03393   // If the function return type is x86_fp80 and the callee return type is not,
03394   // then the FP_EXTEND of the call result is not a nop. It's not safe to
03395   // perform a tailcall optimization here.
03396   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
03397     return false;
03398 
03399   CallingConv::ID CallerCC = CallerF->getCallingConv();
03400   bool CCMatch = CallerCC == CalleeCC;
03401   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
03402   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
03403 
03404   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
03405     if (IsTailCallConvention(CalleeCC) && CCMatch)
03406       return true;
03407     return false;
03408   }
03409 
03410   // Look for obvious safe cases to perform tail call optimization that do not
03411   // require ABI changes. This is what GCC calls a sibcall.
03412 
03413   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
03414   // emit a special epilogue.
03415   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03416       DAG.getSubtarget().getRegisterInfo());
03417   if (RegInfo->needsStackRealignment(MF))
03418     return false;
03419 
03420   // Also avoid sibcall optimization if either caller or callee uses struct
03421   // return semantics.
03422   if (isCalleeStructRet || isCallerStructRet)
03423     return false;
03424 
03425   // An stdcall/thiscall caller is expected to clean up its arguments; the
03426   // callee isn't going to do that.
03427   // FIXME: this is more restrictive than needed. We could produce a tailcall
03428   // when the stack adjustment matches. For example, with a thiscall that takes
03429   // only one argument.
03430   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
03431                    CallerCC == CallingConv::X86_ThisCall))
03432     return false;
03433 
03434   // Do not sibcall optimize vararg calls unless all arguments are passed via
03435   // registers.
03436   if (isVarArg && !Outs.empty()) {
03437 
03438     // Optimizing for varargs on Win64 is unlikely to be safe without
03439     // additional testing.
03440     if (IsCalleeWin64 || IsCallerWin64)
03441       return false;
03442 
03443     SmallVector<CCValAssign, 16> ArgLocs;
03444     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03445                    *DAG.getContext());
03446 
03447     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03448     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
03449       if (!ArgLocs[i].isRegLoc())
03450         return false;
03451   }
03452 
03453   // If the call result is in ST0 / ST1, it needs to be popped off the x87
03454   // stack.  Therefore, if the result is not used, it is not safe to optimize
03455   // this into a sibcall.
03456   bool Unused = false;
03457   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
03458     if (!Ins[i].Used) {
03459       Unused = true;
03460       break;
03461     }
03462   }
03463   if (Unused) {
03464     SmallVector<CCValAssign, 16> RVLocs;
03465     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
03466                    *DAG.getContext());
03467     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
03468     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
03469       CCValAssign &VA = RVLocs[i];
03470       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
03471         return false;
03472     }
03473   }
03474 
03475   // If the calling conventions do not match, then we'd better make sure the
03476   // results are returned in the same way as what the caller expects.
03477   if (!CCMatch) {
03478     SmallVector<CCValAssign, 16> RVLocs1;
03479     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
03480                     *DAG.getContext());
03481     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
03482 
03483     SmallVector<CCValAssign, 16> RVLocs2;
03484     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
03485                     *DAG.getContext());
03486     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
03487 
03488     if (RVLocs1.size() != RVLocs2.size())
03489       return false;
03490     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
03491       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
03492         return false;
03493       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
03494         return false;
03495       if (RVLocs1[i].isRegLoc()) {
03496         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
03497           return false;
03498       } else {
03499         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
03500           return false;
03501       }
03502     }
03503   }
03504 
03505   // If the callee takes no arguments then go on to check the results of the
03506   // call.
03507   if (!Outs.empty()) {
03508     // Check if stack adjustment is needed. For now, do not do this if any
03509     // argument is passed on the stack.
03510     SmallVector<CCValAssign, 16> ArgLocs;
03511     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03512                    *DAG.getContext());
03513 
03514     // Allocate shadow area for Win64
03515     if (IsCalleeWin64)
03516       CCInfo.AllocateStack(32, 8);
03517 
03518     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03519     if (CCInfo.getNextStackOffset()) {
03520       MachineFunction &MF = DAG.getMachineFunction();
03521       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
03522         return false;
03523 
03524       // Check if the arguments are already laid out in the right way as
03525       // the caller's fixed stack objects.
03526       MachineFrameInfo *MFI = MF.getFrameInfo();
03527       const MachineRegisterInfo *MRI = &MF.getRegInfo();
03528       const X86InstrInfo *TII =
03529           static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
03530       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03531         CCValAssign &VA = ArgLocs[i];
03532         SDValue Arg = OutVals[i];
03533         ISD::ArgFlagsTy Flags = Outs[i].Flags;
03534         if (VA.getLocInfo() == CCValAssign::Indirect)
03535           return false;
03536         if (!VA.isRegLoc()) {
03537           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
03538                                    MFI, MRI, TII))
03539             return false;
03540         }
03541       }
03542     }
03543 
03544     // If the tailcall address may be in a register, then make sure it's
03545     // possible to register allocate for it. In 32-bit, the call address can
03546     // only target EAX, EDX, or ECX since the tail call must be scheduled after
03547     // callee-saved registers are restored. These happen to be the same
03548     // registers used to pass 'inreg' arguments so watch out for those.
03549     if (!Subtarget->is64Bit() &&
03550         ((!isa<GlobalAddressSDNode>(Callee) &&
03551           !isa<ExternalSymbolSDNode>(Callee)) ||
03552          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
03553       unsigned NumInRegs = 0;
03554       // In PIC we need an extra register to formulate the address computation
03555       // for the callee.
03556       unsigned MaxInRegs =
03557         (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
03558 
03559       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03560         CCValAssign &VA = ArgLocs[i];
03561         if (!VA.isRegLoc())
03562           continue;
03563         unsigned Reg = VA.getLocReg();
03564         switch (Reg) {
03565         default: break;
03566         case X86::EAX: case X86::EDX: case X86::ECX:
03567           if (++NumInRegs == MaxInRegs)
03568             return false;
03569           break;
03570         }
03571       }
03572     }
03573   }
03574 
03575   return true;
03576 }
03577 
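// Illustrative example (a minimal sketch, not an exhaustive restatement of the
// conditions above): with matching C calling conventions, no struct-return on
// either side, and every argument already in a register or in a matching fixed
// stack slot of the caller, IR such as
//
//   define i32 @caller(i32 %x) {
//     %r = tail call i32 @bar(i32 %x)
//     ret i32 %r
//   }
//
// passes the checks above, while a vararg call with stack-passed arguments or a
// struct-return caller/callee pair does not and falls back to a normal call.
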
03578 FastISel *
03579 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
03580                                   const TargetLibraryInfo *libInfo) const {
03581   return X86::createFastISel(funcInfo, libInfo);
03582 }
03583 
03584 //===----------------------------------------------------------------------===//
03585 //                           Other Lowering Hooks
03586 //===----------------------------------------------------------------------===//
03587 
03588 static bool MayFoldLoad(SDValue Op) {
03589   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
03590 }
03591 
03592 static bool MayFoldIntoStore(SDValue Op) {
03593   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
03594 }
03595 
03596 static bool isTargetShuffle(unsigned Opcode) {
03597   switch(Opcode) {
03598   default: return false;
03599   case X86ISD::BLENDI:
03600   case X86ISD::PSHUFB:
03601   case X86ISD::PSHUFD:
03602   case X86ISD::PSHUFHW:
03603   case X86ISD::PSHUFLW:
03604   case X86ISD::SHUFP:
03605   case X86ISD::PALIGNR:
03606   case X86ISD::MOVLHPS:
03607   case X86ISD::MOVLHPD:
03608   case X86ISD::MOVHLPS:
03609   case X86ISD::MOVLPS:
03610   case X86ISD::MOVLPD:
03611   case X86ISD::MOVSHDUP:
03612   case X86ISD::MOVSLDUP:
03613   case X86ISD::MOVDDUP:
03614   case X86ISD::MOVSS:
03615   case X86ISD::MOVSD:
03616   case X86ISD::UNPCKL:
03617   case X86ISD::UNPCKH:
03618   case X86ISD::VPERMILPI:
03619   case X86ISD::VPERM2X128:
03620   case X86ISD::VPERMI:
03621     return true;
03622   }
03623 }
03624 
03625 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03626                                     SDValue V1, SelectionDAG &DAG) {
03627   switch(Opc) {
03628   default: llvm_unreachable("Unknown x86 shuffle node");
03629   case X86ISD::MOVSHDUP:
03630   case X86ISD::MOVSLDUP:
03631   case X86ISD::MOVDDUP:
03632     return DAG.getNode(Opc, dl, VT, V1);
03633   }
03634 }
03635 
03636 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03637                                     SDValue V1, unsigned TargetMask,
03638                                     SelectionDAG &DAG) {
03639   switch(Opc) {
03640   default: llvm_unreachable("Unknown x86 shuffle node");
03641   case X86ISD::PSHUFD:
03642   case X86ISD::PSHUFHW:
03643   case X86ISD::PSHUFLW:
03644   case X86ISD::VPERMILPI:
03645   case X86ISD::VPERMI:
03646     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
03647   }
03648 }
03649 
03650 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03651                                     SDValue V1, SDValue V2, unsigned TargetMask,
03652                                     SelectionDAG &DAG) {
03653   switch(Opc) {
03654   default: llvm_unreachable("Unknown x86 shuffle node");
03655   case X86ISD::PALIGNR:
03656   case X86ISD::VALIGN:
03657   case X86ISD::SHUFP:
03658   case X86ISD::VPERM2X128:
03659     return DAG.getNode(Opc, dl, VT, V1, V2,
03660                        DAG.getConstant(TargetMask, MVT::i8));
03661   }
03662 }
03663 
03664 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03665                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
03666   switch(Opc) {
03667   default: llvm_unreachable("Unknown x86 shuffle node");
03668   case X86ISD::MOVLHPS:
03669   case X86ISD::MOVLHPD:
03670   case X86ISD::MOVHLPS:
03671   case X86ISD::MOVLPS:
03672   case X86ISD::MOVLPD:
03673   case X86ISD::MOVSS:
03674   case X86ISD::MOVSD:
03675   case X86ISD::UNPCKL:
03676   case X86ISD::UNPCKH:
03677     return DAG.getNode(Opc, dl, VT, V1, V2);
03678   }
03679 }
03680 
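// Minimal usage sketch for the overloads above (assuming 'dl', 'VT', 'V1' and
// 'DAG' are in scope at a lowering call site); they simply dispatch on the
// operand/immediate arity of the target shuffle node:
//
//   // PSHUFD takes one operand plus an 8-bit immediate mask:
//   SDValue Res = getTargetShuffleNode(X86ISD::PSHUFD, dl, VT, V1, 0x1B, DAG);
//   // MOVDDUP takes a single operand and no immediate:
//   SDValue Dup = getTargetShuffleNode(X86ISD::MOVDDUP, dl, VT, V1, DAG);
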
03681 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
03682   MachineFunction &MF = DAG.getMachineFunction();
03683   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03684       DAG.getSubtarget().getRegisterInfo());
03685   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
03686   int ReturnAddrIndex = FuncInfo->getRAIndex();
03687 
03688   if (ReturnAddrIndex == 0) {
03689     // Set up a frame object for the return address.
03690     unsigned SlotSize = RegInfo->getSlotSize();
03691     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
03692                                                            -(int64_t)SlotSize,
03693                                                            false);
03694     FuncInfo->setRAIndex(ReturnAddrIndex);
03695   }
03696 
03697   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
03698 }
03699 
03700 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
03701                                        bool hasSymbolicDisplacement) {
03702   // Offset should fit into 32 bit immediate field.
03703   if (!isInt<32>(Offset))
03704     return false;
03705 
03706   // If we don't have a symbolic displacement - we don't have any extra
03707   // restrictions.
03708   if (!hasSymbolicDisplacement)
03709     return true;
03710 
03711   // FIXME: Some tweaks might be needed for medium code model.
03712   if (M != CodeModel::Small && M != CodeModel::Kernel)
03713     return false;
03714 
03715   // For the small code model we assume that the last object is 16MB below the
03716   // end of the 31-bit boundary. We may also accept pretty large negative
03717   // constants, knowing that all objects are in the positive half of the address space.
03718   if (M == CodeModel::Small && Offset < 16*1024*1024)
03719     return true;
03720 
03721   // For the kernel code model we know that all objects reside in the negative
03722   // half of the 32-bit address space. We must not accept negative offsets, since
03723   // they may be just off, but we may accept pretty large positive ones.
03724   if (M == CodeModel::Kernel && Offset >= 0)
03725     return true;
03726 
03727   return false;
03728 }
03729 
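// Worked examples of the rules above (with a symbolic displacement present):
//   CodeModel::Small,  Offset = 4096          -> true  (well under 16MB)
//   CodeModel::Small,  Offset = 16*1024*1024  -> false (at the 16MB limit)
//   CodeModel::Kernel, Offset = 1024          -> true  (non-negative)
//   CodeModel::Kernel, Offset = -8            -> false (negative offsets rejected)
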
03730 /// isCalleePop - Determines whether the callee is required to pop its
03731 /// own arguments. Callee pop is necessary to support tail calls.
03732 bool X86::isCalleePop(CallingConv::ID CallingConv,
03733                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
03734   switch (CallingConv) {
03735   default:
03736     return false;
03737   case CallingConv::X86_StdCall:
03738   case CallingConv::X86_FastCall:
03739   case CallingConv::X86_ThisCall:
03740     return !is64Bit;
03741   case CallingConv::Fast:
03742   case CallingConv::GHC:
03743   case CallingConv::HiPE:
03744     if (IsVarArg)
03745       return false;
03746     return TailCallOpt;
03747   }
03748 }
03749 
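// For example, per the switch above: a 32-bit X86_StdCall callee pops its own
// arguments, the same convention in 64-bit mode does not, and a non-vararg
// fastcc/GHC/HiPE callee pops only when tail-call optimization (TailCallOpt)
// is enabled.
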
03750 /// \brief Return true if the condition is an unsigned comparison operation.
03751 static bool isX86CCUnsigned(unsigned X86CC) {
03752   switch (X86CC) {
03753   default: llvm_unreachable("Invalid integer condition!");
03754   case X86::COND_E:     return true;
03755   case X86::COND_G:     return false;
03756   case X86::COND_GE:    return false;
03757   case X86::COND_L:     return false;
03758   case X86::COND_LE:    return false;
03759   case X86::COND_NE:    return true;
03760   case X86::COND_B:     return true;
03761   case X86::COND_A:     return true;
03762   case X86::COND_BE:    return true;
03763   case X86::COND_AE:    return true;
03764   }
03765   llvm_unreachable("covered switch fell through?!");
03766 }
03767 
03768 /// TranslateX86CC - Do a one-to-one translation of an ISD::CondCode to the X86-
03769 /// specific condition code, returning the condition code and the LHS/RHS of the
03770 /// comparison to make.
03771 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
03772                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
03773   if (!isFP) {
03774     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
03775       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
03776         // X > -1   -> X == 0, jump !sign.
03777         RHS = DAG.getConstant(0, RHS.getValueType());
03778         return X86::COND_NS;
03779       }
03780       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
03781         // X < 0   -> X == 0, jump on sign.
03782         return X86::COND_S;
03783       }
03784       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
03785         // X < 1   -> X <= 0
03786         RHS = DAG.getConstant(0, RHS.getValueType());
03787         return X86::COND_LE;
03788       }
03789     }
03790 
03791     switch (SetCCOpcode) {
03792     default: llvm_unreachable("Invalid integer condition!");
03793     case ISD::SETEQ:  return X86::COND_E;
03794     case ISD::SETGT:  return X86::COND_G;
03795     case ISD::SETGE:  return X86::COND_GE;
03796     case ISD::SETLT:  return X86::COND_L;
03797     case ISD::SETLE:  return X86::COND_LE;
03798     case ISD::SETNE:  return X86::COND_NE;
03799     case ISD::SETULT: return X86::COND_B;
03800     case ISD::SETUGT: return X86::COND_A;
03801     case ISD::SETULE: return X86::COND_BE;
03802     case ISD::SETUGE: return X86::COND_AE;
03803     }
03804   }
03805 
03806   // First determine if it is required or is profitable to flip the operands.
03807 
03808   // If LHS is a foldable load, but RHS is not, flip the condition.
03809   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
03810       !ISD::isNON_EXTLoad(RHS.getNode())) {
03811     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
03812     std::swap(LHS, RHS);
03813   }
03814 
03815   switch (SetCCOpcode) {
03816   default: break;
03817   case ISD::SETOLT:
03818   case ISD::SETOLE:
03819   case ISD::SETUGT:
03820   case ISD::SETUGE:
03821     std::swap(LHS, RHS);
03822     break;
03823   }
03824 
03825   // On a floating point condition, the flags are set as follows:
03826   // ZF  PF  CF   op
03827   //  0 | 0 | 0 | X > Y
03828   //  0 | 0 | 1 | X < Y
03829   //  1 | 0 | 0 | X == Y
03830   //  1 | 1 | 1 | unordered
03831   switch (SetCCOpcode) {
03832   default: llvm_unreachable("Condcode should be pre-legalized away");
03833   case ISD::SETUEQ:
03834   case ISD::SETEQ:   return X86::COND_E;
03835   case ISD::SETOLT:              // flipped
03836   case ISD::SETOGT:
03837   case ISD::SETGT:   return X86::COND_A;
03838   case ISD::SETOLE:              // flipped
03839   case ISD::SETOGE:
03840   case ISD::SETGE:   return X86::COND_AE;
03841   case ISD::SETUGT:              // flipped
03842   case ISD::SETULT:
03843   case ISD::SETLT:   return X86::COND_B;
03844   case ISD::SETUGE:              // flipped
03845   case ISD::SETULE:
03846   case ISD::SETLE:   return X86::COND_BE;
03847   case ISD::SETONE:
03848   case ISD::SETNE:   return X86::COND_NE;
03849   case ISD::SETUO:   return X86::COND_P;
03850   case ISD::SETO:    return X86::COND_NP;
03851   case ISD::SETOEQ:
03852   case ISD::SETUNE:  return X86::COND_INVALID;
03853   }
03854 }
03855 
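// Illustrative integer examples of the translation above (the FP path below may
// additionally swap LHS/RHS before choosing a condition):
//   setcc X, -1, setgt  ->  COND_NS, RHS rewritten to 0   ("X > -1")
//   setcc X,  0, setlt  ->  COND_S                        ("X < 0")
//   setcc X,  1, setlt  ->  COND_LE, RHS rewritten to 0   ("X < 1" == "X <= 0")
//   setcc X,  Y, setult ->  COND_B  (unsigned below)
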
03856 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
03857 /// code. The current x86 ISA includes the following FP cmov instructions:
03858 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
03859 static bool hasFPCMov(unsigned X86CC) {
03860   switch (X86CC) {
03861   default:
03862     return false;
03863   case X86::COND_B:
03864   case X86::COND_BE:
03865   case X86::COND_E:
03866   case X86::COND_P:
03867   case X86::COND_A:
03868   case X86::COND_AE:
03869   case X86::COND_NE:
03870   case X86::COND_NP:
03871     return true;
03872   }
03873 }
03874 
03875 /// isFPImmLegal - Returns true if the target can instruction select the
03876 /// specified FP immediate natively. If false, the legalizer will
03877 /// materialize the FP immediate as a load from a constant pool.
03878 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
03879   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
03880     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
03881       return true;
03882   }
03883   return false;
03884 }
03885 
03886 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
03887                                               ISD::LoadExtType ExtTy,
03888                                               EVT NewVT) const {
03889   // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
03890   // relocation target a movq or addq instruction: don't let the load shrink.
03891   SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
03892   if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
03893     if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
03894       return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
03895   return true;
03896 }
03897 
03898 /// \brief Returns true if it is beneficial to convert a load of a constant
03899 /// to just the constant itself.
03900 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
03901                                                           Type *Ty) const {
03902   assert(Ty->isIntegerTy());
03903 
03904   unsigned BitSize = Ty->getPrimitiveSizeInBits();
03905   if (BitSize == 0 || BitSize > 64)
03906     return false;
03907   return true;
03908 }
03909 
03910 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
03911                                                 unsigned Index) const {
03912   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
03913     return false;
03914 
03915   return (Index == 0 || Index == ResVT.getVectorNumElements());
03916 }
03917 
03918 bool X86TargetLowering::isCheapToSpeculateCttz() const {
03919   // Speculate cttz only if we can directly use TZCNT.
03920   return Subtarget->hasBMI();
03921 }
03922 
03923 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
03924   // Speculate ctlz only if we can directly use LZCNT.
03925   return Subtarget->hasLZCNT();
03926 }
03927 
03928 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
03929 /// the specified half-open range [Low, Hi).
03930 static bool isUndefOrInRange(int Val, int Low, int Hi) {
03931   return (Val < 0) || (Val >= Low && Val < Hi);
03932 }
03933 
03934 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
03935 /// specified value.
03936 static bool isUndefOrEqual(int Val, int CmpVal) {
03937   return (Val < 0 || Val == CmpVal);
03938 }
03939 
03940 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
03941 /// at position Pos and ending at Pos+Size, falls within the specified
03942 /// sequential range [Low, Low+Size) or is undef.
03943 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
03944                                        unsigned Pos, unsigned Size, int Low) {
03945   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
03946     if (!isUndefOrEqual(Mask[i], Low))
03947       return false;
03948   return true;
03949 }
03950 
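// Quick example of the helpers above: for Mask = <4, -1, 6, 7>,
// isSequentialOrUndefInRange(Mask, 0, 4, 4) is true, because each defined
// element equals its expected sequential value (4, 5, 6, 7) and the -1 entry
// counts as undef.
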
03951 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
03952 /// is suitable for input to PSHUFD. That is, it doesn't reference the other
03953 /// operand - by default will match for first operand.
03954 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
03955                          bool TestSecondOperand = false) {
03956   if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
03957       VT != MVT::v2f64 && VT != MVT::v2i64)
03958     return false;
03959 
03960   unsigned NumElems = VT.getVectorNumElements();
03961   unsigned Lo = TestSecondOperand ? NumElems : 0;
03962   unsigned Hi = Lo + NumElems;
03963 
03964   for (unsigned i = 0; i < NumElems; ++i)
03965     if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
03966       return false;
03967 
03968   return true;
03969 }
03970 
03971 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
03972 /// is suitable for input to PSHUFHW.
03973 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03974   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03975     return false;
03976 
03977   // Lower quadword copied in order or undef.
03978   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
03979     return false;
03980 
03981   // Upper quadword shuffled.
03982   for (unsigned i = 4; i != 8; ++i)
03983     if (!isUndefOrInRange(Mask[i], 4, 8))
03984       return false;
03985 
03986   if (VT == MVT::v16i16) {
03987     // Lower quadword copied in order or undef.
03988     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
03989       return false;
03990 
03991     // Upper quadword shuffled.
03992     for (unsigned i = 12; i != 16; ++i)
03993       if (!isUndefOrInRange(Mask[i], 12, 16))
03994         return false;
03995   }
03996 
03997   return true;
03998 }
03999 
04000 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
04001 /// is suitable for input to PSHUFLW.
04002 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04003   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
04004     return false;
04005 
04006   // Upper quadword copied in order.
04007   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
04008     return false;
04009 
04010   // Lower quadword shuffled.
04011   for (unsigned i = 0; i != 4; ++i)
04012     if (!isUndefOrInRange(Mask[i], 0, 4))
04013       return false;
04014 
04015   if (VT == MVT::v16i16) {
04016     // Upper quadword copied in order.
04017     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
04018       return false;
04019 
04020     // Lower quadword shuffled.
04021     for (unsigned i = 8; i != 12; ++i)
04022       if (!isUndefOrInRange(Mask[i], 8, 12))
04023         return false;
04024   }
04025 
04026   return true;
04027 }
04028 
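// Example masks for a v8i16 shuffle (a single 128-bit lane):
//   <0, 1, 2, 3, 7, 6, 5, 4>  matches isPSHUFHWMask (upper quad permuted in place)
//   <2, 1, 0, 3, 4, 5, 6, 7>  matches isPSHUFLWMask (lower quad permuted in place)
// In both cases the untouched quadword must be copied in order (or be undef).
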
04029 /// \brief Return true if the mask specifies a shuffle of elements that is
04030 /// suitable for input to intralane (palignr) or interlane (valign) vector
04031 /// right-shift.
04032 static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
04033   unsigned NumElts = VT.getVectorNumElements();
04034   unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
04035   unsigned NumLaneElts = NumElts/NumLanes;
04036 
04037   // Do not handle 64-bit element shuffles with palignr.
04038   if (NumLaneElts == 2)
04039     return false;
04040 
04041   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
04042     unsigned i;
04043     for (i = 0; i != NumLaneElts; ++i) {
04044       if (Mask[i+l] >= 0)
04045         break;
04046     }
04047 
04048     // Lane is all undef, go to next lane
04049     if (i == NumLaneElts)
04050       continue;
04051 
04052     int Start = Mask[i+l];
04053 
04054     // Make sure it's in this lane in one of the sources.
04055     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
04056         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
04057       return false;
04058 
04059     // If not lane 0, then we must match lane 0
04060     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
04061       return false;
04062 
04063     // Correct second source to be contiguous with first source
04064     if (Start >= (int)NumElts)
04065       Start -= NumElts - NumLaneElts;
04066 
04067     // Make sure we're shifting in the right direction.
04068     if (Start <= (int)(i+l))
04069       return false;
04070 
04071     Start -= i;
04072 
04073     // Check the rest of the elements to see if they are consecutive.
04074     for (++i; i != NumLaneElts; ++i) {
04075       int Idx = Mask[i+l];
04076 
04077       // Make sure it's in this lane.
04078       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
04079           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
04080         return false;
04081 
04082       // If not lane 0, then we must match lane 0
04083       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
04084         return false;
04085 
04086       if (Idx >= (int)NumElts)
04087         Idx -= NumElts - NumLaneElts;
04088 
04089       if (!isUndefOrEqual(Idx, Start+i))
04090         return false;
04091 
04092     }
04093   }
04094 
04095   return true;
04096 }
04097 
04098 /// \brief Return true if the node specifies a shuffle of elements that is
04099 /// suitable for input to PALIGNR.
04100 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
04101                           const X86Subtarget *Subtarget) {
04102   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
04103       (VT.is256BitVector() && !Subtarget->hasInt256()) ||
04104       VT.is512BitVector())
04105     // FIXME: Add AVX512BW.
04106     return false;
04107 
04108   return isAlignrMask(Mask, VT, false);
04109 }
04110 
04111 /// \brief Return true if the node specifies a shuffle of elements that is
04112 /// suitable for input to VALIGN.
04113 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
04114                           const X86Subtarget *Subtarget) {
04115   // FIXME: Add AVX512VL.
04116   if (!VT.is512BitVector() || !Subtarget->hasAVX512())
04117     return false;
04118   return isAlignrMask(Mask, VT, true);
04119 }
04120 
04121 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
04122 /// the two vector operands have swapped position.
04123 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
04124                                      unsigned NumElems) {
04125   for (unsigned i = 0; i != NumElems; ++i) {
04126     int idx = Mask[i];
04127     if (idx < 0)
04128       continue;
04129     else if (idx < (int)NumElems)
04130       Mask[i] = idx + NumElems;
04131     else
04132       Mask[i] = idx - NumElems;
04133   }
04134 }
04135 
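// For example, commuting the v4 mask <0, 5, 2, 7> (NumElems = 4) yields
// <4, 1, 6, 3>: indices that referred to the first operand now refer to the
// second, and vice versa.
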
04136 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
04137 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
04138 /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be
04139 /// reverse of what x86 shuffles want.
04140 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
04141 
04142   unsigned NumElems = VT.getVectorNumElements();
04143   unsigned NumLanes = VT.getSizeInBits()/128;
04144   unsigned NumLaneElems = NumElems/NumLanes;
04145 
04146   if (NumLaneElems != 2 && NumLaneElems != 4)
04147     return false;
04148 
04149   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04150   bool symetricMaskRequired =
04151     (VT.getSizeInBits() >= 256) && (EltSize == 32);
04152 
04153   // VSHUFPSY divides the resulting vector into 4 chunks.
04154   // The sources are also split into 4 chunks, and each destination
04155   // chunk must come from a different source chunk.
04156   //
04157   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
04158   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
04159   //
04160   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
04161   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
04162   //
04163   // VSHUFPDY divides the resulting vector into 4 chunks.
04164   // The sources are also split into 4 chunks, and each destination
04165   // chunk must come from a different source chunk.
04166   //
04167   //  SRC1 =>      X3       X2       X1       X0
04168   //  SRC2 =>      Y3       Y2       Y1       Y0
04169   //
04170   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
04171   //
04172   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
04173   unsigned HalfLaneElems = NumLaneElems/2;
04174   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
04175     for (unsigned i = 0; i != NumLaneElems; ++i) {
04176       int Idx = Mask[i+l];
04177       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
04178       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
04179         return false;
04180       // For VSHUFPSY, the mask of the second half must be the same as the
04181       // first but with the appropriate offsets. This works in the same way as
04182       // VPERMILPS works with masks.
04183       if (!symetricMaskRequired || Idx < 0)
04184         continue;
04185       if (MaskVal[i] < 0) {
04186         MaskVal[i] = Idx - l;
04187         continue;
04188       }
04189       if ((signed)(Idx - l) != MaskVal[i])
04190         return false;
04191     }
04192   }
04193 
04194   return true;
04195 }
04196 
04197 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
04198 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
04199 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
04200   if (!VT.is128BitVector())
04201     return false;
04202 
04203   unsigned NumElems = VT.getVectorNumElements();
04204 
04205   if (NumElems != 4)
04206     return false;
04207 
04208   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
04209   return isUndefOrEqual(Mask[0], 6) &&
04210          isUndefOrEqual(Mask[1], 7) &&
04211          isUndefOrEqual(Mask[2], 2) &&
04212          isUndefOrEqual(Mask[3], 3);
04213 }
04214 
04215 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
04216 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
04217 /// <2, 3, 2, 3>
04218 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
04219   if (!VT.is128BitVector())
04220     return false;
04221 
04222   unsigned NumElems = VT.getVectorNumElements();
04223 
04224   if (NumElems != 4)
04225     return false;
04226 
04227   return isUndefOrEqual(Mask[0], 2) &&
04228          isUndefOrEqual(Mask[1], 3) &&
04229          isUndefOrEqual(Mask[2], 2) &&
04230          isUndefOrEqual(Mask[3], 3);
04231 }
04232 
04233 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
04234 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
04235 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
04236   if (!VT.is128BitVector())
04237     return false;
04238 
04239   unsigned NumElems = VT.getVectorNumElements();
04240 
04241   if (NumElems != 2 && NumElems != 4)
04242     return false;
04243 
04244   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04245     if (!isUndefOrEqual(Mask[i], i + NumElems))
04246       return false;
04247 
04248   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
04249     if (!isUndefOrEqual(Mask[i], i))
04250       return false;
04251 
04252   return true;
04253 }
04254 
04255 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
04256 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
04257 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
04258   if (!VT.is128BitVector())
04259     return false;
04260 
04261   unsigned NumElems = VT.getVectorNumElements();
04262 
04263   if (NumElems != 2 && NumElems != 4)
04264     return false;
04265 
04266   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04267     if (!isUndefOrEqual(Mask[i], i))
04268       return false;
04269 
04270   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04271     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
04272       return false;
04273 
04274   return true;
04275 }
04276 
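// Canonical v4f32 examples of the two predicates above:
//   <6, 7, 2, 3>  matches isMOVHLPSMask (V2's high half lands in the low half,
//                 V1's high half stays in place)
//   <0, 1, 4, 5>  matches isMOVLHPSMask (the low halves of V1 and V2 are
//                 concatenated)
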
04277 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
04278 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
04279 /// i.e. if all but one element comes from the same vector.
04280 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
04281   // TODO: Deal with AVX's VINSERTPS
04282   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
04283     return false;
04284 
04285   unsigned CorrectPosV1 = 0;
04286   unsigned CorrectPosV2 = 0;
04287   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
04288     if (Mask[i] == -1) {
04289       ++CorrectPosV1;
04290       ++CorrectPosV2;
04291       continue;
04292     }
04293 
04294     if (Mask[i] == i)
04295       ++CorrectPosV1;
04296     else if (Mask[i] == i + 4)
04297       ++CorrectPosV2;
04298   }
04299 
04300   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
04301     // We have 3 elements (undefs count as elements from any vector) from one
04302     // vector, and one from another.
04303     return true;
04304 
04305   return false;
04306 }
04307 
04308 //
04309 // Some special combinations that can be optimized.
04310 //
04311 static
04312 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
04313                                SelectionDAG &DAG) {
04314   MVT VT = SVOp->getSimpleValueType(0);
04315   SDLoc dl(SVOp);
04316 
04317   if (VT != MVT::v8i32 && VT != MVT::v8f32)
04318     return SDValue();
04319 
04320   ArrayRef<int> Mask = SVOp->getMask();
04321 
04322   // These are the special masks that may be optimized.
04323   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
04324   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
04325   bool MatchEvenMask = true;
04326   bool MatchOddMask  = true;
04327   for (int i=0; i<8; ++i) {
04328     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
04329       MatchEvenMask = false;
04330     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
04331       MatchOddMask = false;
04332   }
04333 
04334   if (!MatchEvenMask && !MatchOddMask)
04335     return SDValue();
04336 
04337   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
04338 
04339   SDValue Op0 = SVOp->getOperand(0);
04340   SDValue Op1 = SVOp->getOperand(1);
04341 
04342   if (MatchEvenMask) {
04343     // Shift the second operand right to 32 bits.
04344     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
04345     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
04346   } else {
04347     // Shift the first operand left to 32 bits.
04348     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
04349     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
04350   }
04351   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
04352   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
04353 }
04354 
04355 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
04356 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
04357 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
04358                          bool HasInt256, bool V2IsSplat = false) {
04359 
04360   assert(VT.getSizeInBits() >= 128 &&
04361          "Unsupported vector type for unpckl");
04362 
04363   unsigned NumElts = VT.getVectorNumElements();
04364   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04365       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04366     return false;
04367 
04368   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
04369          "Unsupported vector type for unpckh");
04370 
04371   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04372   unsigned NumLanes = VT.getSizeInBits()/128;
04373   unsigned NumLaneElts = NumElts/NumLanes;
04374 
04375   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04376     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04377       int BitI  = Mask[l+i];
04378       int BitI1 = Mask[l+i+1];
04379       if (!isUndefOrEqual(BitI, j))
04380         return false;
04381       if (V2IsSplat) {
04382         if (!isUndefOrEqual(BitI1, NumElts))
04383           return false;
04384       } else {
04385         if (!isUndefOrEqual(BitI1, j + NumElts))
04386           return false;
04387       }
04388     }
04389   }
04390 
04391   return true;
04392 }
04393 
04394 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
04395 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
04396 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
04397                          bool HasInt256, bool V2IsSplat = false) {
04398   assert(VT.getSizeInBits() >= 128 &&
04399          "Unsupported vector type for unpckh");
04400 
04401   unsigned NumElts = VT.getVectorNumElements();
04402   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04403       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04404     return false;
04405 
04406   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
04407          "Unsupported vector type for unpckh");
04408 
04409   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04410   unsigned NumLanes = VT.getSizeInBits()/128;
04411   unsigned NumLaneElts = NumElts/NumLanes;
04412 
04413   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04414     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04415       int BitI  = Mask[l+i];
04416       int BitI1 = Mask[l+i+1];
04417       if (!isUndefOrEqual(BitI, j))
04418         return false;
04419       if (V2IsSplat) {
04420         if (isUndefOrEqual(BitI1, NumElts))
04421           return false;
04422       } else {
04423         if (!isUndefOrEqual(BitI1, j+NumElts))
04424           return false;
04425       }
04426     }
04427   }
04428   return true;
04429 }
04430 
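// Example v4f32/v4i32 masks for the interleave predicates above:
//   <0, 4, 1, 5>  matches isUNPCKLMask (low elements of V1 and V2 interleaved)
//   <2, 6, 3, 7>  matches isUNPCKHMask (high elements of V1 and V2 interleaved)
// On 256-bit types the same pattern must repeat independently in each 128-bit
// lane, as noted in the comments above.
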
04431 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
04432 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
04433 /// <0, 0, 1, 1>
04434 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04435   unsigned NumElts = VT.getVectorNumElements();
04436   bool Is256BitVec = VT.is256BitVector();
04437 
04438   if (VT.is512BitVector())
04439     return false;
04440   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04441          "Unsupported vector type for unpckh");
04442 
04443   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
04444       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04445     return false;
04446 
04447   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
04448   // FIXME: Need a better way to get rid of this, there's no latency difference
04449   // between UNPCKLPD and MOVDDUP, the latter should always be checked first and
04450   // the former later. We should also remove the "_undef" special mask.
04451   if (NumElts == 4 && Is256BitVec)
04452     return false;
04453 
04454   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04455   // independently on 128-bit lanes.
04456   unsigned NumLanes = VT.getSizeInBits()/128;
04457   unsigned NumLaneElts = NumElts/NumLanes;
04458 
04459   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04460     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04461       int BitI  = Mask[l+i];
04462       int BitI1 = Mask[l+i+1];
04463 
04464       if (!isUndefOrEqual(BitI, j))
04465         return false;
04466       if (!isUndefOrEqual(BitI1, j))
04467         return false;
04468     }
04469   }
04470 
04471   return true;
04472 }
04473 
04474 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
04475 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
04476 /// <2, 2, 3, 3>
04477 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04478   unsigned NumElts = VT.getVectorNumElements();
04479 
04480   if (VT.is512BitVector())
04481     return false;
04482 
04483   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04484          "Unsupported vector type for unpckh");
04485 
04486   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04487       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04488     return false;
04489 
04490   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04491   // independently on 128-bit lanes.
04492   unsigned NumLanes = VT.getSizeInBits()/128;
04493   unsigned NumLaneElts = NumElts/NumLanes;
04494 
04495   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04496     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04497       int BitI  = Mask[l+i];
04498       int BitI1 = Mask[l+i+1];
04499       if (!isUndefOrEqual(BitI, j))
04500         return false;
04501       if (!isUndefOrEqual(BitI1, j))
04502         return false;
04503     }
04504   }
04505   return true;
04506 }
04507 
04508 // Match the INSERTI64x4/INSERTF64x4 instructions: (src0[0], src1[0]) or
04509 // (src1[0], src0[1]), i.e. manipulation of 256-bit sub-vectors.
04510 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
04511   if (!VT.is512BitVector())
04512     return false;
04513 
04514   unsigned NumElts = VT.getVectorNumElements();
04515   unsigned HalfSize = NumElts/2;
04516   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
04517     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
04518       *Imm = 1;
04519       return true;
04520     }
04521   }
04522   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
04523     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
04524       *Imm = 0;
04525       return true;
04526     }
04527   }
04528   return false;
04529 }
04530 
04531 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
04532 /// specifies a shuffle of elements that is suitable for input to MOVSS,
04533 /// MOVSD, and MOVD, i.e. setting the lowest element.
04534 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
04535   if (VT.getVectorElementType().getSizeInBits() < 32)
04536     return false;
04537   if (!VT.is128BitVector())
04538     return false;
04539 
04540   unsigned NumElts = VT.getVectorNumElements();
04541 
04542   if (!isUndefOrEqual(Mask[0], NumElts))
04543     return false;
04544 
04545   for (unsigned i = 1; i != NumElts; ++i)
04546     if (!isUndefOrEqual(Mask[i], i))
04547       return false;
04548 
04549   return true;
04550 }
04551 
04552 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
04553 /// as permutations between 128-bit chunks or halves. For example, in the
04554 /// shuffle below:
04555 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
04556 /// the first half comes from the second half of V1 and the second half from
04557 /// the second half of V2.
04558 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04559   if (!HasFp256 || !VT.is256BitVector())
04560     return false;
04561 
04562   // The shuffle result is divided into half A and half B. In total the two
04563   // sources have 4 halves, namely: C, D, E, F. The final values of A and
04564   // B must come from C, D, E or F.
04565   unsigned HalfSize = VT.getVectorNumElements()/2;
04566   bool MatchA = false, MatchB = false;
04567 
04568   // Check if A comes from one of C, D, E, F.
04569   for (unsigned Half = 0; Half != 4; ++Half) {
04570     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
04571       MatchA = true;
04572       break;
04573     }
04574   }
04575 
04576   // Check if B comes from one of C, D, E, F.
04577   for (unsigned Half = 0; Half != 4; ++Half) {
04578     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
04579       MatchB = true;
04580       break;
04581     }
04582   }
04583 
04584   return MatchA && MatchB;
04585 }
04586 
04587 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
04588 /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
04589 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
04590   MVT VT = SVOp->getSimpleValueType(0);
04591 
04592   unsigned HalfSize = VT.getVectorNumElements()/2;
04593 
04594   unsigned FstHalf = 0, SndHalf = 0;
04595   for (unsigned i = 0; i < HalfSize; ++i) {
04596     if (SVOp->getMaskElt(i) > 0) {
04597       FstHalf = SVOp->getMaskElt(i)/HalfSize;
04598       break;
04599     }
04600   }
04601   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
04602     if (SVOp->getMaskElt(i) > 0) {
04603       SndHalf = SVOp->getMaskElt(i)/HalfSize;
04604       break;
04605     }
04606   }
04607 
04608   return (FstHalf | (SndHalf << 4));
04609 }
04610 
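// Worked example for the two routines above: the v8f32 mask
// <4, 5, 6, 7, 12, 13, 14, 15> selects the upper half of V1 and the upper half
// of V2, so FstHalf = 1 and SndHalf = 3, giving the immediate 0x31, i.e. the
// usual "select both upper 128-bit halves" encoding for VPERM2F128/VPERM2I128.
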
04611 // Symmetric in-lane mask. Each lane has 4 elements (for imm8).
04612 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
04613   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04614   if (EltSize < 32)
04615     return false;
04616 
04617   unsigned NumElts = VT.getVectorNumElements();
04618   Imm8 = 0;
04619   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
04620     for (unsigned i = 0; i != NumElts; ++i) {
04621       if (Mask[i] < 0)
04622         continue;
04623       Imm8 |= Mask[i] << (i*2);
04624     }
04625     return true;
04626   }
04627 
04628   unsigned LaneSize = 4;
04629   SmallVector<int, 4> MaskVal(LaneSize, -1);
04630 
04631   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04632     for (unsigned i = 0; i != LaneSize; ++i) {
04633       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04634         return false;
04635       if (Mask[i+l] < 0)
04636         continue;
04637       if (MaskVal[i] < 0) {
04638         MaskVal[i] = Mask[i+l] - l;
04639         Imm8 |= MaskVal[i] << (i*2);
04640         continue;
04641       }
04642       if (Mask[i+l] != (signed)(MaskVal[i]+l))
04643         return false;
04644     }
04645   }
04646   return true;
04647 }
04648 
04649 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
04650 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
04651 /// Note that VPERMIL mask matching differs depending on whether the underlying
04652 /// element type is 32 or 64 bits. For VPERMILPS the high half of the mask should
04653 /// point to the same elements as the low half, but within the higher half of the
04654 /// source. For VPERMILPD the two lanes can be shuffled independently of each other
04655 /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY.
04656 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
04657   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04658   if (VT.getSizeInBits() < 256 || EltSize < 32)
04659     return false;
04660   bool symetricMaskRequired = (EltSize == 32);
04661   unsigned NumElts = VT.getVectorNumElements();
04662 
04663   unsigned NumLanes = VT.getSizeInBits()/128;
04664   unsigned LaneSize = NumElts/NumLanes;
04665   // 2 or 4 elements in one lane
04666 
04667   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
04668   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04669     for (unsigned i = 0; i != LaneSize; ++i) {
04670       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04671         return false;
04672       if (symetricMaskRequired) {
04673         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
04674           ExpectedMaskVal[i] = Mask[i+l] - l;
04675           continue;
04676         }
04677         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
04678           return false;
04679       }
04680     }
04681   }
04682   return true;
04683 }
04684 
04685 /// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
04686 /// X86 MOVSS wants: the lowest element must be the lowest element of vector 2,
04687 /// and the other elements must come from vector 1 in order.
04688 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
04689                                bool V2IsSplat = false, bool V2IsUndef = false) {
04690   if (!VT.is128BitVector())
04691     return false;
04692 
04693   unsigned NumOps = VT.getVectorNumElements();
04694   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
04695     return false;
04696 
04697   if (!isUndefOrEqual(Mask[0], 0))
04698     return false;
04699 
04700   for (unsigned i = 1; i != NumOps; ++i)
04701     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
04702           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
04703           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
04704       return false;
04705 
04706   return true;
04707 }
04708 
04709 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04710 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
04711 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
04712 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
04713                            const X86Subtarget *Subtarget) {
04714   if (!Subtarget->hasSSE3())
04715     return false;
04716 
04717   unsigned NumElems = VT.getVectorNumElements();
04718 
04719   if ((VT.is128BitVector() && NumElems != 4) ||
04720       (VT.is256BitVector() && NumElems != 8) ||
04721       (VT.is512BitVector() && NumElems != 16))
04722     return false;
04723 
04724   // "i+1" is the value the indexed mask element must have
04725   for (unsigned i = 0; i != NumElems; i += 2)
04726     if (!isUndefOrEqual(Mask[i], i+1) ||
04727         !isUndefOrEqual(Mask[i+1], i+1))
04728       return false;
04729 
04730   return true;
04731 }
04732 
04733 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04734 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
04735 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
04736 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
04737                            const X86Subtarget *Subtarget) {
04738   if (!Subtarget->hasSSE3())
04739     return false;
04740 
04741   unsigned NumElems = VT.getVectorNumElements();
04742 
04743   if ((VT.is128BitVector() && NumElems != 4) ||
04744       (VT.is256BitVector() && NumElems != 8) ||
04745       (VT.is512BitVector() && NumElems != 16))
04746     return false;
04747 
04748   // "i" is the value the indexed mask element must have
04749   for (unsigned i = 0; i != NumElems; i += 2)
04750     if (!isUndefOrEqual(Mask[i], i) ||
04751         !isUndefOrEqual(Mask[i+1], i))
04752       return false;
04753 
04754   return true;
04755 }
04756 
04757 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
04758 /// specifies a shuffle of elements that is suitable for input to 256-bit
04759 /// version of MOVDDUP.
04760 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04761   if (!HasFp256 || !VT.is256BitVector())
04762     return false;
04763 
04764   unsigned NumElts = VT.getVectorNumElements();
04765   if (NumElts != 4)
04766     return false;
04767 
04768   for (unsigned i = 0; i != NumElts/2; ++i)
04769     if (!isUndefOrEqual(Mask[i], 0))
04770       return false;
04771   for (unsigned i = NumElts/2; i != NumElts; ++i)
04772     if (!isUndefOrEqual(Mask[i], NumElts/2))
04773       return false;
04774   return true;
04775 }
04776 
04777 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04778 /// specifies a shuffle of elements that is suitable for input to 128-bit
04779 /// version of MOVDDUP.
04780 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
04781   if (!VT.is128BitVector())
04782     return false;
04783 
04784   unsigned e = VT.getVectorNumElements() / 2;
04785   for (unsigned i = 0; i != e; ++i)
04786     if (!isUndefOrEqual(Mask[i], i))
04787       return false;
04788   for (unsigned i = 0; i != e; ++i)
04789     if (!isUndefOrEqual(Mask[e+i], i))
04790       return false;
04791   return true;
04792 }
04793 
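// Example masks accepted by the duplicate predicates above (assuming SSE3 for
// the MOVS*DUP checks and HasFp256 for the 256-bit MOVDDUP check):
//   v4f32 <1, 1, 3, 3>   -> isMOVSHDUPMask (duplicate odd elements)
//   v4f32 <0, 0, 2, 2>   -> isMOVSLDUPMask (duplicate even elements)
//   v2f64 <0, 0>         -> isMOVDDUPMask  (128-bit MOVDDUP)
//   v4f64 <0, 0, 2, 2>   -> isMOVDDUPYMask (256-bit MOVDDUP)
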
04794 /// isVEXTRACTIndex - Return true if the specified
04795 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
04796 /// suitable for instructions that extract 128- or 256-bit vectors.
04797 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
04798   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04799   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04800     return false;
04801 
04802   // The index should be aligned on a vecWidth-bit boundary.
04803   uint64_t Index =
04804     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04805 
04806   MVT VT = N->getSimpleValueType(0);
04807   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04808   bool Result = (Index * ElSize) % vecWidth == 0;
04809 
04810   return Result;
04811 }
04812 
04813 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
04814 /// operand specifies a subvector insert that is suitable for input to
04815 /// insertion of 128- or 256-bit subvectors.
04816 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
04817   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04818   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04819     return false;
04820   // The index should be aligned on a vecWidth-bit boundary.
04821   uint64_t Index =
04822     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04823 
04824   MVT VT = N->getSimpleValueType(0);
04825   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04826   bool Result = (Index * ElSize) % vecWidth == 0;
04827 
04828   return Result;
04829 }
04830 
04831 bool X86::isVINSERT128Index(SDNode *N) {
04832   return isVINSERTIndex(N, 128);
04833 }
04834 
04835 bool X86::isVINSERT256Index(SDNode *N) {
04836   return isVINSERTIndex(N, 256);
04837 }
04838 
04839 bool X86::isVEXTRACT128Index(SDNode *N) {
04840   return isVEXTRACTIndex(N, 128);
04841 }
04842 
04843 bool X86::isVEXTRACT256Index(SDNode *N) {
04844   return isVEXTRACTIndex(N, 256);
04845 }
04846 
04847 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
04848 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
04849 /// Handles 128-bit and 256-bit.
04850 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
04851   MVT VT = N->getSimpleValueType(0);
04852 
04853   assert((VT.getSizeInBits() >= 128) &&
04854          "Unsupported vector type for PSHUF/SHUFP");
04855 
04856   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
04857   // independently on 128-bit lanes.
04858   unsigned NumElts = VT.getVectorNumElements();
04859   unsigned NumLanes = VT.getSizeInBits()/128;
04860   unsigned NumLaneElts = NumElts/NumLanes;
04861 
04862   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
04863          "Only supports 2, 4 or 8 elements per lane");
04864 
04865   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
04866   unsigned Mask = 0;
04867   for (unsigned i = 0; i != NumElts; ++i) {
04868     int Elt = N->getMaskElt(i);
04869     if (Elt < 0) continue;
04870     Elt &= NumLaneElts - 1;
04871     unsigned ShAmt = (i << Shift) % 8;
04872     Mask |= Elt << ShAmt;
04873   }
04874 
04875   return Mask;
04876 }
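// A minimal standalone sketch (hypothetical helper, not part of the lowering
// above) of the immediate packing for a single 4-element lane: two bits per
// destination slot, low element first. For the lane mask <3, 1, 2, 0>:
//   i=0: 3 << 0 = 0x03,  i=1: 1 << 2 = 0x04,
//   i=2: 2 << 4 = 0x20,  i=3: 0 << 6 = 0x00   =>  immediate 0x27.
static unsigned packShufImm4Example(ArrayRef<int> Mask) {
  assert(Mask.size() == 4 && "sketch only covers a 4-element 128-bit lane");
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Mask[i] < 0)
      continue;                        // undef lanes contribute nothing
    Imm |= (Mask[i] & 0x3) << (i * 2); // mirrors Elt << ShAmt above
  }
  return Imm;
}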
04877 
04878 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
04879 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
04880 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
04881   MVT VT = N->getSimpleValueType(0);
04882 
04883   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04884          "Unsupported vector type for PSHUFHW");
04885 
04886   unsigned NumElts = VT.getVectorNumElements();
04887 
04888   unsigned Mask = 0;
04889   for (unsigned l = 0; l != NumElts; l += 8) {
04890     // 8 elements per lane, but we only care about the last 4.
04891     for (unsigned i = 0; i < 4; ++i) {
04892       int Elt = N->getMaskElt(l+i+4);
04893       if (Elt < 0) continue;
04894       Elt &= 0x3; // only 2-bits.
04895       Mask |= Elt << (i * 2);
04896     }
04897   }
04898 
04899   return Mask;
04900 }
04901 
04902 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
04903 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
04904 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
04905   MVT VT = N->getSimpleValueType(0);
04906 
04907   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04908          "Unsupported vector type for PSHUFLW");
04909 
04910   unsigned NumElts = VT.getVectorNumElements();
04911 
04912   unsigned Mask = 0;
04913   for (unsigned l = 0; l != NumElts; l += 8) {
04914     // 8 elements per lane, but we only care about the first 4.
04915     for (unsigned i = 0; i < 4; ++i) {
04916       int Elt = N->getMaskElt(l+i);
04917       if (Elt < 0) continue;
04918       Elt &= 0x3; // only 2-bits
04919       Mask |= Elt << (i * 2);
04920     }
04921   }
04922 
04923   return Mask;
04924 }
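// A hedged, standalone sketch of the PSHUFLW encoding above (hypothetical
// helper; PSHUFHW is identical except that it reads mask slots 4..7 per lane
// and the & 0x3 strips the implicit +4). For a v8i16 mask whose low words are
// <2, 1, 3, 0> the immediate is 2 | (1 << 2) | (3 << 4) | (0 << 6) = 0x36.
static unsigned packPshuflwImmExample(ArrayRef<int> Mask) {
  assert(Mask.size() >= 4 && "expects at least the low four word indices");
  unsigned Imm = 0;
  for (unsigned i = 0; i != 4; ++i) {
    if (Mask[i] < 0)
      continue;                        // undef keeps the bits at zero
    Imm |= (Mask[i] & 0x3) << (i * 2); // two bits per destination word
  }
  return Imm;
}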
04925 
04926 /// \brief Return the appropriate immediate to shuffle the specified
04927 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
04928 /// VALIGN (if InterLane is true) instructions.
04929 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
04930                                            bool InterLane) {
04931   MVT VT = SVOp->getSimpleValueType(0);
04932   unsigned EltSize = InterLane ? 1 :
04933     VT.getVectorElementType().getSizeInBits() >> 3;
04934 
04935   unsigned NumElts = VT.getVectorNumElements();
04936   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
04937   unsigned NumLaneElts = NumElts/NumLanes;
04938 
04939   int Val = 0;
04940   unsigned i;
04941   for (i = 0; i != NumElts; ++i) {
04942     Val = SVOp->getMaskElt(i);
04943     if (Val >= 0)
04944       break;
04945   }
04946   if (Val >= (int)NumElts)
04947     Val -= NumElts - NumLaneElts;
04948 
04949   assert(Val - i > 0 && "PALIGNR imm should be positive");
04950   return (Val - i) * EltSize;
04951 }
04952 
04953 /// \brief Return the appropriate immediate to shuffle the specified
04954 /// VECTOR_SHUFFLE mask with the PALIGNR instruction.
04955 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
04956   return getShuffleAlignrImmediate(SVOp, false);
04957 }
04958 
04959 /// \brief Return the appropriate immediate to shuffle the specified
04960 /// VECTOR_SHUFFLE mask with the VALIGN instruction.
04961 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
04962   return getShuffleAlignrImmediate(SVOp, true);
04963 }
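// A minimal sketch of the byte-offset computation above for the 128-bit
// PALIGNR case (hypothetical helper, assuming the mask has already been
// validated as a rotation and folded to reference only the first source).
// For a v16i8 mask <5, 6, 7, ...> the first defined element is 5 at position
// 0, so the immediate is (5 - 0) * 1 = 5 bytes.
static unsigned palignrImmExample(ArrayRef<int> Mask, unsigned EltSizeInBytes) {
  unsigned i = 0;
  while (i != Mask.size() && Mask[i] < 0)
    ++i;                               // skip leading undefs
  assert(i != Mask.size() && "all-undef mask has no rotation amount");
  return (unsigned)(Mask[i] - (int)i) * EltSizeInBytes; // PALIGNR byte count
}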
04964 
04965 
04966 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
04967   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04968   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04969     llvm_unreachable("Illegal extract subvector for VEXTRACT");
04970 
04971   uint64_t Index =
04972     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04973 
04974   MVT VecVT = N->getOperand(0).getSimpleValueType();
04975   MVT ElVT = VecVT.getVectorElementType();
04976 
04977   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04978   return Index / NumElemsPerChunk;
04979 }
04980 
04981 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
04982   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04983   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04984     llvm_unreachable("Illegal insert subvector for VINSERT");
04985 
04986   uint64_t Index =
04987     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04988 
04989   MVT VecVT = N->getSimpleValueType(0);
04990   MVT ElVT = VecVT.getVectorElementType();
04991 
04992   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04993   return Index / NumElemsPerChunk;
04994 }
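// A hedged sketch of the chunk-index math shared by the two helpers above
// (hypothetical helper). With 128-bit chunks and 32-bit elements,
// NumElemsPerChunk = 4, so element index 4 of a v8f32 maps to immediate 1,
// i.e. the upper xmm half of a ymm register for VEXTRACTF128/VINSERTF128.
static unsigned subvectorImmExample(unsigned EltIndex, unsigned EltSizeInBits,
                                    unsigned ChunkWidthInBits) {
  unsigned NumElemsPerChunk = ChunkWidthInBits / EltSizeInBits;
  return EltIndex / NumElemsPerChunk; // e.g. 4 / 4 == 1
}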
04995 
04996 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
04997 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
04998 /// and VEXTRACTI128 instructions.
04999 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
05000   return getExtractVEXTRACTImmediate(N, 128);
05001 }
05002 
05003 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
05004 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
05005 /// and VEXTRACTI64x4 instructions.
05006 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
05007   return getExtractVEXTRACTImmediate(N, 256);
05008 }
05009 
05010 /// getInsertVINSERT128Immediate - Return the appropriate immediate
05011 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
05012 /// and VINSERTI128 instructions.
05013 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
05014   return getInsertVINSERTImmediate(N, 128);
05015 }
05016 
05017 /// getInsertVINSERT256Immediate - Return the appropriate immediate
05018 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
05019 /// and VINSERTI64x4 instructions.
05020 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
05021   return getInsertVINSERTImmediate(N, 256);
05022 }
05023 
05024 /// isZero - Returns true if V is a constant integer zero
05025 static bool isZero(SDValue V) {
05026   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
05027   return C && C->isNullValue();
05028 }
05029 
05030 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
05031 /// constant +0.0.
05032 bool X86::isZeroNode(SDValue Elt) {
05033   if (isZero(Elt))
05034     return true;
05035   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
05036     return CFP->getValueAPF().isPosZero();
05037   return false;
05038 }
05039 
05040 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
05041 /// match movhlps. The lower half elements should come from the upper half of
05042 /// V1 (and in order), and the upper half elements should come from the upper
05043 /// half of V2 (and in order).
05044 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
05045   if (!VT.is128BitVector())
05046     return false;
05047   if (VT.getVectorNumElements() != 4)
05048     return false;
05049   for (unsigned i = 0, e = 2; i != e; ++i)
05050     if (!isUndefOrEqual(Mask[i], i+2))
05051       return false;
05052   for (unsigned i = 2; i != 4; ++i)
05053     if (!isUndefOrEqual(Mask[i], i+4))
05054       return false;
05055   return true;
05056 }
05057 
05058 /// isScalarLoadToVector - Returns true if the node is a scalar load that
05059 /// is promoted to a vector. It also returns the LoadSDNode by reference if
05060 /// required.
05061 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
05062   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
05063     return false;
05064   N = N->getOperand(0).getNode();
05065   if (!ISD::isNON_EXTLoad(N))
05066     return false;
05067   if (LD)
05068     *LD = cast<LoadSDNode>(N);
05069   return true;
05070 }
05071 
05072 // Test whether the given value is a constant BUILD_VECTOR which will be
05073 // legalized into a constant pool load.
05074 static bool WillBeConstantPoolLoad(SDNode *N) {
05075   if (N->getOpcode() != ISD::BUILD_VECTOR)
05076     return false;
05077 
05078   // Check for any non-constant elements.
05079   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
05080     switch (N->getOperand(i).getNode()->getOpcode()) {
05081     case ISD::UNDEF:
05082     case ISD::ConstantFP:
05083     case ISD::Constant:
05084       break;
05085     default:
05086       return false;
05087     }
05088 
05089   // Vectors of all-zeros and all-ones are materialized with special
05090   // instructions rather than being loaded.
05091   return !ISD::isBuildVectorAllZeros(N) &&
05092          !ISD::isBuildVectorAllOnes(N);
05093 }
05094 
05095 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
05096 /// match movlp{s|d}. The lower half elements should come from the lower half of
05097 /// V1 (and in order), and the upper half elements should come from the upper
05098 /// half of V2 (and in order). And since V1 will become the source of the
05099 /// MOVLP, it must be either a vector load or a scalar load to vector.
05100 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
05101                                ArrayRef<int> Mask, MVT VT) {
05102   if (!VT.is128BitVector())
05103     return false;
05104 
05105   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
05106     return false;
05107   // If V2 is a vector load, don't do this transformation. We will try to use
05108   // a load-folding shufps op instead.
05109   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
05110     return false;
05111 
05112   unsigned NumElems = VT.getVectorNumElements();
05113 
05114   if (NumElems != 2 && NumElems != 4)
05115     return false;
05116   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
05117     if (!isUndefOrEqual(Mask[i], i))
05118       return false;
05119   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
05120     if (!isUndefOrEqual(Mask[i], i+NumElems))
05121       return false;
05122   return true;
05123 }
05124 
05125 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
05126 /// to a zero vector.
05127 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
05128 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
05129   SDValue V1 = N->getOperand(0);
05130   SDValue V2 = N->getOperand(1);
05131   unsigned NumElems = N->getValueType(0).getVectorNumElements();
05132   for (unsigned i = 0; i != NumElems; ++i) {
05133     int Idx = N->getMaskElt(i);
05134     if (Idx >= (int)NumElems) {
05135       unsigned Opc = V2.getOpcode();
05136       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
05137         continue;
05138       if (Opc != ISD::BUILD_VECTOR ||
05139           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
05140         return false;
05141     } else if (Idx >= 0) {
05142       unsigned Opc = V1.getOpcode();
05143       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
05144         continue;
05145       if (Opc != ISD::BUILD_VECTOR ||
05146           !X86::isZeroNode(V1.getOperand(Idx)))
05147         return false;
05148     }
05149   }
05150   return true;
05151 }
05152 
05153 /// getZeroVector - Returns a vector of specified type with all zero elements.
05154 ///
05155 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
05156                              SelectionDAG &DAG, SDLoc dl) {
05157   assert(VT.isVector() && "Expected a vector type");
05158 
05159   // Always build SSE zero vectors as <4 x i32> bitcasted
05160   // to their dest type. This ensures they get CSE'd.
05161   SDValue Vec;
05162   if (VT.is128BitVector()) {  // SSE
05163     if (Subtarget->hasSSE2()) {  // SSE2
05164       SDValue Cst = DAG.getConstant(0, MVT::i32);
05165       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05166     } else { // SSE1
05167       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
05168       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
05169     }
05170   } else if (VT.is256BitVector()) { // AVX
05171     if (Subtarget->hasInt256()) { // AVX2
05172       SDValue Cst = DAG.getConstant(0, MVT::i32);
05173       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05174       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
05175     } else {
05176       // 256-bit logic and arithmetic instructions in AVX are all
05177       // floating-point, no support for integer ops. Emit fp zeroed vectors.
05178       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
05179       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05180       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
05181     }
05182   } else if (VT.is512BitVector()) { // AVX-512
05183       SDValue Cst = DAG.getConstant(0, MVT::i32);
05184       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
05185                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05186       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
05187   } else if (VT.getScalarType() == MVT::i1) {
05188     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
05189     SDValue Cst = DAG.getConstant(0, MVT::i1);
05190     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
05191     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
05192   } else
05193     llvm_unreachable("Unexpected vector type");
05194 
05195   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
05196 }
05197 
05198 /// getOnesVector - Returns a vector of specified type with all bits set.
05199 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
05200 /// no AVX2 support, use two <4 x i32> inserted in an <8 x i32> appropriately.
05201 /// Then bitcast to their original type, ensuring they get CSE'd.
05202 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
05203                              SDLoc dl) {
05204   assert(VT.isVector() && "Expected a vector type");
05205 
05206   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
05207   SDValue Vec;
05208   if (VT.is256BitVector()) {
05209     if (HasInt256) { // AVX2
05210       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05211       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
05212     } else { // AVX
05213       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05214       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
05215     }
05216   } else if (VT.is128BitVector()) {
05217     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05218   } else
05219     llvm_unreachable("Unexpected vector type");
05220 
05221   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
05222 }
05223 
05224 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
05225 /// that point to V2 point to its first element.
05226 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
05227   for (unsigned i = 0; i != NumElems; ++i) {
05228     if (Mask[i] > (int)NumElems) {
05229       Mask[i] = NumElems;
05230     }
05231   }
05232 }
05233 
05234 /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd
05235 /// operation of specified width.
05236 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
05237                        SDValue V2) {
05238   unsigned NumElems = VT.getVectorNumElements();
05239   SmallVector<int, 8> Mask;
05240   Mask.push_back(NumElems);
05241   for (unsigned i = 1; i != NumElems; ++i)
05242     Mask.push_back(i);
05243   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05244 }
05245 
05246 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
05247 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
05248                           SDValue V2) {
05249   unsigned NumElems = VT.getVectorNumElements();
05250   SmallVector<int, 8> Mask;
05251   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
05252     Mask.push_back(i);
05253     Mask.push_back(i + NumElems);
05254   }
05255   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05256 }
05257 
05258 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
05259 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
05260                           SDValue V2) {
05261   unsigned NumElems = VT.getVectorNumElements();
05262   SmallVector<int, 8> Mask;
05263   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
05264     Mask.push_back(i + Half);
05265     Mask.push_back(i + NumElems + Half);
05266   }
05267   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05268 }
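// A small standalone sketch (hypothetical helper) of the masks the two
// builders above produce. For a 4-element type, getUnpackl interleaves the
// low halves as <0, 4, 1, 5> and getUnpackh the high halves as <2, 6, 3, 7>.
static SmallVector<int, 8> unpackMaskExample(unsigned NumElems, bool High) {
  SmallVector<int, 8> Mask;
  unsigned Base = High ? NumElems / 2 : 0;
  for (unsigned i = 0; i != NumElems / 2; ++i) {
    Mask.push_back(Base + i);            // element taken from V1
    Mask.push_back(Base + i + NumElems); // matching element taken from V2
  }
  return Mask;
}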
05269 
05270 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
05271 // a generic shuffle instruction because the target has no such instructions.
05272 // Generate shuffles which repeat i16 and i8 several times until they can be
05273 // represented by v4f32 and then be manipulated by target-supported shuffles.
05274 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
05275   MVT VT = V.getSimpleValueType();
05276   int NumElems = VT.getVectorNumElements();
05277   SDLoc dl(V);
05278 
05279   while (NumElems > 4) {
05280     if (EltNo < NumElems/2) {
05281       V = getUnpackl(DAG, dl, VT, V, V);
05282     } else {
05283       V = getUnpackh(DAG, dl, VT, V, V);
05284       EltNo -= NumElems/2;
05285     }
05286     NumElems >>= 1;
05287   }
05288   return V;
05289 }
05290 
05291 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
05292 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
05293   MVT VT = V.getSimpleValueType();
05294   SDLoc dl(V);
05295 
05296   if (VT.is128BitVector()) {
05297     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
05298     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
05299     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
05300                              &SplatMask[0]);
05301   } else if (VT.is256BitVector()) {
05302     // To use VPERMILPS to splat scalars, the second half of indices must
05303     // refer to the higher part, which is a duplication of the lower one,
05304     // because VPERMILPS can only handle in-lane permutations.
05305     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
05306                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
05307 
05308     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
05309     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
05310                              &SplatMask[0]);
05311   } else
05312     llvm_unreachable("Vector size not supported");
05313 
05314   return DAG.getNode(ISD::BITCAST, dl, VT, V);
05315 }
05316 
05317 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
05318 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
05319   MVT SrcVT = SV->getSimpleValueType(0);
05320   SDValue V1 = SV->getOperand(0);
05321   SDLoc dl(SV);
05322 
05323   int EltNo = SV->getSplatIndex();
05324   int NumElems = SrcVT.getVectorNumElements();
05325   bool Is256BitVec = SrcVT.is256BitVector();
05326 
05327   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
05328          "Unknown how to promote splat for type");
05329 
05330   // Extract the 128-bit part containing the splat element and update
05331   // the splat element index when it refers to the higher register.
05332   if (Is256BitVec) {
05333     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
05334     if (EltNo >= NumElems/2)
05335       EltNo -= NumElems/2;
05336   }
05337 
05338   // All i16 and i8 vector types can't be used directly by a generic shuffle
05339   // instruction because the target has no such instruction. Generate shuffles
05340   // which repeat i16 and i8 several times until they fit in i32, and then can
05341   // be manipulated by target-supported shuffles.
05342   MVT EltVT = SrcVT.getVectorElementType();
05343   if (EltVT == MVT::i8 || EltVT == MVT::i16)
05344     V1 = PromoteSplati8i16(V1, DAG, EltNo);
05345 
05346   // Recreate the 256-bit vector and place the same 128-bit vector
05347   // into the low and high part. This is necessary because we want
05348   // to use VPERM* to shuffle the vectors
05349   if (Is256BitVec) {
05350     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
05351   }
05352 
05353   return getLegalSplat(DAG, V1, EltNo);
05354 }
05355 
05356 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
05357 /// vector and a zero or undef vector.  This produces a shuffle where the low
05358 /// element of V2 is swizzled into the zero/undef vector, landing at element
05359 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
05360 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
05361                                            bool IsZero,
05362                                            const X86Subtarget *Subtarget,
05363                                            SelectionDAG &DAG) {
05364   MVT VT = V2.getSimpleValueType();
05365   SDValue V1 = IsZero
05366     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
05367   unsigned NumElems = VT.getVectorNumElements();
05368   SmallVector<int, 16> MaskVec;
05369   for (unsigned i = 0; i != NumElems; ++i)
05370     // If this is the insertion idx, put the low elt of V2 here.
05371     MaskVec.push_back(i == Idx ? NumElems : i);
05372   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
05373 }
05374 
05375 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
05376 /// target specific opcode. Returns true if the Mask could be calculated. Sets
05377 /// IsUnary to true if it only uses one source. Note that this will set IsUnary for
05378 /// shuffles which use a single input multiple times, and in those cases it will
05379 /// adjust the mask to only have indices within that single input.
05380 static bool getTargetShuffleMask(SDNode *N, MVT VT,
05381                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
05382   unsigned NumElems = VT.getVectorNumElements();
05383   SDValue ImmN;
05384 
05385   IsUnary = false;
05386   bool IsFakeUnary = false;
05387   switch(N->getOpcode()) {
05388   case X86ISD::BLENDI:
05389     ImmN = N->getOperand(N->getNumOperands()-1);
05390     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05391     break;
05392   case X86ISD::SHUFP:
05393     ImmN = N->getOperand(N->getNumOperands()-1);
05394     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05395     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05396     break;
05397   case X86ISD::UNPCKH:
05398     DecodeUNPCKHMask(VT, Mask);
05399     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05400     break;
05401   case X86ISD::UNPCKL:
05402     DecodeUNPCKLMask(VT, Mask);
05403     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05404     break;
05405   case X86ISD::MOVHLPS:
05406     DecodeMOVHLPSMask(NumElems, Mask);
05407     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05408     break;
05409   case X86ISD::MOVLHPS:
05410     DecodeMOVLHPSMask(NumElems, Mask);
05411     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05412     break;
05413   case X86ISD::PALIGNR:
05414     ImmN = N->getOperand(N->getNumOperands()-1);
05415     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05416     break;
05417   case X86ISD::PSHUFD:
05418   case X86ISD::VPERMILPI:
05419     ImmN = N->getOperand(N->getNumOperands()-1);
05420     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05421     IsUnary = true;
05422     break;
05423   case X86ISD::PSHUFHW:
05424     ImmN = N->getOperand(N->getNumOperands()-1);
05425     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05426     IsUnary = true;
05427     break;
05428   case X86ISD::PSHUFLW:
05429     ImmN = N->getOperand(N->getNumOperands()-1);
05430     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05431     IsUnary = true;
05432     break;
05433   case X86ISD::PSHUFB: {
05434     IsUnary = true;
05435     SDValue MaskNode = N->getOperand(1);
05436     while (MaskNode->getOpcode() == ISD::BITCAST)
05437       MaskNode = MaskNode->getOperand(0);
05438 
05439     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
05440       // If we have a build-vector, then things are easy.
05441       EVT VT = MaskNode.getValueType();
05442       assert(VT.isVector() &&
05443              "Can't produce a non-vector with a build_vector!");
05444       if (!VT.isInteger())
05445         return false;
05446 
05447       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
05448 
05449       SmallVector<uint64_t, 32> RawMask;
05450       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
05451         SDValue Op = MaskNode->getOperand(i);
05452         if (Op->getOpcode() == ISD::UNDEF) {
05453           RawMask.push_back((uint64_t)SM_SentinelUndef);
05454           continue;
05455         }
05456         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
05457         if (!CN)
05458           return false;
05459         APInt MaskElement = CN->getAPIntValue();
05460 
05461         // We now have to decode the element which could be any integer size and
05462         // extract each byte of it.
05463         for (int j = 0; j < NumBytesPerElement; ++j) {
05464           // Note that this is x86 and so always little endian: the low byte is
05465           // the first byte of the mask.
05466           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
05467           MaskElement = MaskElement.lshr(8);
05468         }
05469       }
05470       DecodePSHUFBMask(RawMask, Mask);
05471       break;
05472     }
05473 
05474     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
05475     if (!MaskLoad)
05476       return false;
05477 
05478     SDValue Ptr = MaskLoad->getBasePtr();
05479     if (Ptr->getOpcode() == X86ISD::Wrapper)
05480       Ptr = Ptr->getOperand(0);
05481 
05482     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
05483     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
05484       return false;
05485 
05486     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
05487       DecodePSHUFBMask(C, Mask);
05488       break;
05489     }
05490 
05491     return false;
05492   }
05493   case X86ISD::VPERMI:
05494     ImmN = N->getOperand(N->getNumOperands()-1);
05495     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05496     IsUnary = true;
05497     break;
05498   case X86ISD::MOVSS:
05499   case X86ISD::MOVSD:
05500     DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
05501     break;
05502   case X86ISD::VPERM2X128:
05503     ImmN = N->getOperand(N->getNumOperands()-1);
05504     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05505     if (Mask.empty()) return false;
05506     break;
05507   case X86ISD::MOVSLDUP:
05508     DecodeMOVSLDUPMask(VT, Mask);
05509     IsUnary = true;
05510     break;
05511   case X86ISD::MOVSHDUP:
05512     DecodeMOVSHDUPMask(VT, Mask);
05513     IsUnary = true;
05514     break;
05515   case X86ISD::MOVDDUP:
05516     DecodeMOVDDUPMask(VT, Mask);
05517     IsUnary = true;
05518     break;
05519   case X86ISD::MOVLHPD:
05520   case X86ISD::MOVLPD:
05521   case X86ISD::MOVLPS:
05522     // Not yet implemented
05523     return false;
05524   default: llvm_unreachable("unknown target shuffle node");
05525   }
05526 
05527   // If we have a fake unary shuffle, the shuffle mask is spread across two
05528   // inputs that are actually the same node. Re-map the mask to always point
05529   // into the first input.
05530   if (IsFakeUnary)
05531     for (int &M : Mask)
05532       if (M >= (int)Mask.size())
05533         M -= Mask.size();
05534 
05535   return true;
05536 }
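// A hedged sketch of the little-endian byte extraction used for PSHUFB
// build-vector constants above (hypothetical helper). A 32-bit mask element
// 0x03020100 expands to the raw bytes {0x00, 0x01, 0x02, 0x03}, low byte
// first, before DecodePSHUFBMask turns them into shuffle indices.
static void explodeMaskElementExample(APInt MaskElement,
                                      SmallVectorImpl<uint64_t> &RawMask) {
  unsigned NumBytes = MaskElement.getBitWidth() / 8;
  for (unsigned j = 0; j != NumBytes; ++j) {
    RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
    MaskElement = MaskElement.lshr(8); // x86 is little endian: low byte first
  }
}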
05537 
05538 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
05539 /// element of the result of the vector shuffle.
05540 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
05541                                    unsigned Depth) {
05542   if (Depth == 6)
05543     return SDValue();  // Limit search depth.
05544 
05545   SDValue V = SDValue(N, 0);
05546   EVT VT = V.getValueType();
05547   unsigned Opcode = V.getOpcode();
05548 
05549   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
05550   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
05551     int Elt = SV->getMaskElt(Index);
05552 
05553     if (Elt < 0)
05554       return DAG.getUNDEF(VT.getVectorElementType());
05555 
05556     unsigned NumElems = VT.getVectorNumElements();
05557     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
05558                                          : SV->getOperand(1);
05559     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
05560   }
05561 
05562   // Recurse into target specific vector shuffles to find scalars.
05563   if (isTargetShuffle(Opcode)) {
05564     MVT ShufVT = V.getSimpleValueType();
05565     unsigned NumElems = ShufVT.getVectorNumElements();
05566     SmallVector<int, 16> ShuffleMask;
05567     bool IsUnary;
05568 
05569     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
05570       return SDValue();
05571 
05572     int Elt = ShuffleMask[Index];
05573     if (Elt < 0)
05574       return DAG.getUNDEF(ShufVT.getVectorElementType());
05575 
05576     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
05577                                          : N->getOperand(1);
05578     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
05579                                Depth+1);
05580   }
05581 
05582   // Actual nodes that may contain scalar elements
05583   if (Opcode == ISD::BITCAST) {
05584     V = V.getOperand(0);
05585     EVT SrcVT = V.getValueType();
05586     unsigned NumElems = VT.getVectorNumElements();
05587 
05588     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
05589       return SDValue();
05590   }
05591 
05592   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
05593     return (Index == 0) ? V.getOperand(0)
05594                         : DAG.getUNDEF(VT.getVectorElementType());
05595 
05596   if (V.getOpcode() == ISD::BUILD_VECTOR)
05597     return V.getOperand(Index);
05598 
05599   return SDValue();
05600 }
05601 
05602 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
05603 /// shuffle operation which are consecutively zero. The
05604 /// search can start in two different directions, from left or right.
05605 /// We count undefs as zeros until PreferredNum is reached.
05606 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
05607                                          unsigned NumElems, bool ZerosFromLeft,
05608                                          SelectionDAG &DAG,
05609                                          unsigned PreferredNum = -1U) {
05610   unsigned NumZeros = 0;
05611   for (unsigned i = 0; i != NumElems; ++i) {
05612     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
05613     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
05614     if (!Elt.getNode())
05615       break;
05616 
05617     if (X86::isZeroNode(Elt))
05618       ++NumZeros;
05619     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
05620       NumZeros = std::min(NumZeros + 1, PreferredNum);
05621     else
05622       break;
05623   }
05624 
05625   return NumZeros;
05626 }
05627 
05628 /// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
05629 /// correspond consecutively to elements from one of the vector operands,
05630 /// starting from its index OpIdx. OpNum is set to the matching source operand.
05631 static
05632 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
05633                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
05634                               unsigned NumElems, unsigned &OpNum) {
05635   bool SeenV1 = false;
05636   bool SeenV2 = false;
05637 
05638   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
05639     int Idx = SVOp->getMaskElt(i);
05640     // Ignore undef indices
05641     if (Idx < 0)
05642       continue;
05643 
05644     if (Idx < (int)NumElems)
05645       SeenV1 = true;
05646     else
05647       SeenV2 = true;
05648 
05649     // Only accept consecutive elements from the same vector
05650     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
05651       return false;
05652   }
05653 
05654   OpNum = SeenV1 ? 0 : 1;
05655   return true;
05656 }
05657 
05658 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
05659 /// logical right shift of a vector.
05660 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05661                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05662   unsigned NumElems =
05663     SVOp->getSimpleValueType(0).getVectorNumElements();
05664   unsigned NumZeros = getNumOfConsecutiveZeros(
05665       SVOp, NumElems, false /* check zeros from right */, DAG,
05666       SVOp->getMaskElt(0));
05667   unsigned OpSrc;
05668 
05669   if (!NumZeros)
05670     return false;
05671 
05672   // Considering the elements in the mask that are not consecutive zeros,
05673   // check if they consecutively come from only one of the source vectors.
05674   //
05675   //               V1 = {X, A, B, C}     0
05676   //                         \  \  \    /
05677   //   vector_shuffle V1, V2 <1, 2, 3, X>
05678   //
05679   if (!isShuffleMaskConsecutive(SVOp,
05680             0,                   // Mask Start Index
05681             NumElems-NumZeros,   // Mask End Index(exclusive)
05682             NumZeros,            // Where to start looking in the src vector
05683             NumElems,            // Number of elements in vector
05684             OpSrc))              // Which source operand ?
05685     return false;
05686 
05687   isLeft = false;
05688   ShAmt = NumZeros;
05689   ShVal = SVOp->getOperand(OpSrc);
05690   return true;
05691 }
05692 
05693 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
05694 /// logical left shift of a vector.
05695 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05696                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05697   unsigned NumElems =
05698     SVOp->getSimpleValueType(0).getVectorNumElements();
05699   unsigned NumZeros = getNumOfConsecutiveZeros(
05700       SVOp, NumElems, true /* check zeros from left */, DAG,
05701       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
05702   unsigned OpSrc;
05703 
05704   if (!NumZeros)
05705     return false;
05706 
05707   // Considering the elements in the mask that are not consecutive zeros,
05708   // check if they consecutively come from only one of the source vectors.
05709   //
05710   //                           0    { A, B, X, X } = V2
05711   //                          / \    /  /
05712   //   vector_shuffle V1, V2 <X, X, 4, 5>
05713   //
05714   if (!isShuffleMaskConsecutive(SVOp,
05715             NumZeros,     // Mask Start Index
05716             NumElems,     // Mask End Index(exclusive)
05717             0,            // Where to start looking in the src vector
05718             NumElems,     // Number of elements in vector
05719             OpSrc))       // Which source operand ?
05720     return false;
05721 
05722   isLeft = true;
05723   ShAmt = NumZeros;
05724   ShVal = SVOp->getOperand(OpSrc);
05725   return true;
05726 }
05727 
05728 /// isVectorShift - Returns true if the shuffle can be implemented as a
05729 /// logical left or right shift of a vector.
05730 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05731                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05732   // Although the logic below supports any bitwidth size, there are no
05733   // shift instructions which handle more than 128-bit vectors.
05734   if (!SVOp->getSimpleValueType(0).is128BitVector())
05735     return false;
05736 
05737   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
05738       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
05739     return true;
05740 
05741   return false;
05742 }
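// A minimal standalone sketch (hypothetical helper) of the pattern the checks
// above recognize, assuming V2 is known to be the zero vector so that any
// mask index >= NumElems reads a zero. The v4i32 mask <1, 2, 3, 4> keeps
// elements 1..3 of V1 and feeds one zero in from the top, i.e. a logical
// right shift by one 32-bit element (VSRLDQ by 4 bytes once legalized).
static bool looksLikeElementRightShiftExample(ArrayRef<int> Mask,
                                              unsigned &ShAmt) {
  unsigned NumElems = Mask.size();
  unsigned NumZeros = 0;
  for (unsigned i = NumElems; i != 0 && Mask[i - 1] >= (int)NumElems; --i)
    ++NumZeros;                        // trailing lanes that read the zero V2
  if (!NumZeros || NumZeros == NumElems)
    return false;
  for (unsigned i = 0; i != NumElems - NumZeros; ++i)
    if (Mask[i] >= 0 && Mask[i] != (int)(i + NumZeros))
      return false;                    // leading lanes must be V1[i + ShAmt]
  ShAmt = NumZeros;
  return true;
}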
05743 
05744 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
05745 ///
05746 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
05747                                        unsigned NumNonZero, unsigned NumZero,
05748                                        SelectionDAG &DAG,
05749                                        const X86Subtarget* Subtarget,
05750                                        const TargetLowering &TLI) {
05751   if (NumNonZero > 8)
05752     return SDValue();
05753 
05754   SDLoc dl(Op);
05755   SDValue V;
05756   bool First = true;
05757   for (unsigned i = 0; i < 16; ++i) {
05758     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
05759     if (ThisIsNonZero && First) {
05760       if (NumZero)
05761         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05762       else
05763         V = DAG.getUNDEF(MVT::v8i16);
05764       First = false;
05765     }
05766 
05767     if ((i & 1) != 0) {
05768       SDValue ThisElt, LastElt;
05769       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
05770       if (LastIsNonZero) {
05771         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
05772                               MVT::i16, Op.getOperand(i-1));
05773       }
05774       if (ThisIsNonZero) {
05775         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
05776         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
05777                               ThisElt, DAG.getConstant(8, MVT::i8));
05778         if (LastIsNonZero)
05779           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
05780       } else
05781         ThisElt = LastElt;
05782 
05783       if (ThisElt.getNode())
05784         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
05785                         DAG.getIntPtrConstant(i/2));
05786     }
05787   }
05788 
05789   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
05790 }
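// A standalone sketch (hypothetical helper) of the byte pairing done above:
// each odd-indexed byte is zero-extended, shifted left by 8 and OR'd with the
// preceding even-indexed byte, so the byte pair {0x12, 0x34} becomes the i16
// 0x3412 (the even byte lands in the low half, matching x86's little-endian
// layout once the v8i16 is bitcast back to v16i8).
static uint16_t packBytePairExample(uint8_t EvenByte, uint8_t OddByte) {
  uint16_t Hi = (uint16_t)OddByte << 8; // mirrors the ISD::SHL by 8 above
  return Hi | EvenByte;                 // mirrors the ISD::OR with LastElt
}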
05791 
05792 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
05793 ///
05794 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
05795                                      unsigned NumNonZero, unsigned NumZero,
05796                                      SelectionDAG &DAG,
05797                                      const X86Subtarget* Subtarget,
05798                                      const TargetLowering &TLI) {
05799   if (NumNonZero > 4)
05800     return SDValue();
05801 
05802   SDLoc dl(Op);
05803   SDValue V;
05804   bool First = true;
05805   for (unsigned i = 0; i < 8; ++i) {
05806     bool isNonZero = (NonZeros & (1 << i)) != 0;
05807     if (isNonZero) {
05808       if (First) {
05809         if (NumZero)
05810           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05811         else
05812           V = DAG.getUNDEF(MVT::v8i16);
05813         First = false;
05814       }
05815       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
05816                       MVT::v8i16, V, Op.getOperand(i),
05817                       DAG.getIntPtrConstant(i));
05818     }
05819   }
05820 
05821   return V;
05822 }
05823 
05824 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
05825 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
05826                                      const X86Subtarget *Subtarget,
05827                                      const TargetLowering &TLI) {
05828   // Find all zeroable elements.
05829   bool Zeroable[4];
05830   for (int i=0; i < 4; ++i) {
05831     SDValue Elt = Op->getOperand(i);
05832     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
05833   }
05834   assert(std::count_if(&Zeroable[0], &Zeroable[4],
05835                        [](bool M) { return !M; }) > 1 &&
05836          "We expect at least two non-zero elements!");
05837 
05838   // We only know how to deal with build_vector nodes where elements are either
05839   // zeroable or extract_vector_elt with constant index.
05840   SDValue FirstNonZero;
05841   unsigned FirstNonZeroIdx;
05842   for (unsigned i=0; i < 4; ++i) {
05843     if (Zeroable[i])
05844       continue;
05845     SDValue Elt = Op->getOperand(i);
05846     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
05847         !isa<ConstantSDNode>(Elt.getOperand(1)))
05848       return SDValue();
05849     // Make sure that this node is extracting from a 128-bit vector.
05850     MVT VT = Elt.getOperand(0).getSimpleValueType();
05851     if (!VT.is128BitVector())
05852       return SDValue();
05853     if (!FirstNonZero.getNode()) {
05854       FirstNonZero = Elt;
05855       FirstNonZeroIdx = i;
05856     }
05857   }
05858 
05859   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
05860   SDValue V1 = FirstNonZero.getOperand(0);
05861   MVT VT = V1.getSimpleValueType();
05862 
05863   // See if this build_vector can be lowered as a blend with zero.
05864   SDValue Elt;
05865   unsigned EltMaskIdx, EltIdx;
05866   int Mask[4];
05867   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
05868     if (Zeroable[EltIdx]) {
05869       // The zero vector will be on the right hand side.
05870       Mask[EltIdx] = EltIdx+4;
05871       continue;
05872     }
05873 
05874     Elt = Op->getOperand(EltIdx);
05875     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
05876     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
05877     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
05878       break;
05879     Mask[EltIdx] = EltIdx;
05880   }
05881 
05882   if (EltIdx == 4) {
05883     // Let the shuffle legalizer deal with blend operations.
05884     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
05885     if (V1.getSimpleValueType() != VT)
05886       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
05887     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
05888   }
05889 
05890   // See if we can lower this build_vector to an INSERTPS.
05891   if (!Subtarget->hasSSE41())
05892     return SDValue();
05893 
05894   SDValue V2 = Elt.getOperand(0);
05895   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
05896     V1 = SDValue();
05897 
05898   bool CanFold = true;
05899   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
05900     if (Zeroable[i])
05901       continue;
05902 
05903     SDValue Current = Op->getOperand(i);
05904     SDValue SrcVector = Current->getOperand(0);
05905     if (!V1.getNode())
05906       V1 = SrcVector;
05907     CanFold = SrcVector == V1 &&
05908       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
05909   }
05910 
05911   if (!CanFold)
05912     return SDValue();
05913 
05914   assert(V1.getNode() && "Expected at least two non-zero elements!");
05915   if (V1.getSimpleValueType() != MVT::v4f32)
05916     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
05917   if (V2.getSimpleValueType() != MVT::v4f32)
05918     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
05919 
05920   // Ok, we can emit an INSERTPS instruction.
05921   unsigned ZMask = 0;
05922   for (int i = 0; i < 4; ++i)
05923     if (Zeroable[i])
05924       ZMask |= 1 << i;
05925 
05926   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
05927   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
05928   SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
05929                                DAG.getIntPtrConstant(InsertPSMask));
05930   return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
05931 }
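// A hedged sketch of the INSERTPS immediate layout assembled above
// (hypothetical helper): bits [7:6] select the source element, bits [5:4] the
// destination element, and bits [3:0] zero the corresponding result lanes.
// For example, inserting source element 2 into destination element 1 while
// zeroing lane 3 gives (2 << 6) | (1 << 4) | 0b1000 = 0x98.
static unsigned insertPSImmExample(unsigned SrcElt, unsigned DstElt,
                                   unsigned ZMask) {
  assert(SrcElt < 4 && DstElt < 4 && ZMask < 16 && "out-of-range fields");
  return (SrcElt << 6) | (DstElt << 4) | ZMask;
}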
05932 
05933 /// Return a vector logical shift node.
05934 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
05935                          unsigned NumBits, SelectionDAG &DAG,
05936                          const TargetLowering &TLI, SDLoc dl) {
05937   assert(VT.is128BitVector() && "Unknown type for VShift");
05938   MVT ShVT = MVT::v2i64;
05939   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
05940   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
05941   MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(SrcOp.getValueType());
05942   SDValue ShiftVal = DAG.getConstant(NumBits, ScalarShiftTy);
05943   return DAG.getNode(ISD::BITCAST, dl, VT,
05944                      DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
05945 }
05946 
05947 static SDValue
05948 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
05949 
05950   // Check if the scalar load can be widened into a vector load. And if
05951   // the address is "base + cst" see if the cst can be "absorbed" into
05952   // the address is "base + cst", see if the cst can be "absorbed" into
05953   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
05954     SDValue Ptr = LD->getBasePtr();
05955     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
05956       return SDValue();
05957     EVT PVT = LD->getValueType(0);
05958     if (PVT != MVT::i32 && PVT != MVT::f32)
05959       return SDValue();
05960 
05961     int FI = -1;
05962     int64_t Offset = 0;
05963     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
05964       FI = FINode->getIndex();
05965       Offset = 0;
05966     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
05967                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
05968       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
05969       Offset = Ptr.getConstantOperandVal(1);
05970       Ptr = Ptr.getOperand(0);
05971     } else {
05972       return SDValue();
05973     }
05974 
05975     // FIXME: 256-bit vector instructions don't require a strict alignment,
05976     // improve this code to support it better.
05977     unsigned RequiredAlign = VT.getSizeInBits()/8;
05978     SDValue Chain = LD->getChain();
05979     // Make sure the stack object alignment is at least 16 or 32.
05980     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
05981     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
05982       if (MFI->isFixedObjectIndex(FI)) {
05983         // Can't change the alignment. FIXME: It's possible to compute
05984         // the exact stack offset and reference FI + adjust offset instead.
05985         // If someone *really* cares about this. That's the way to implement it.
05986         return SDValue();
05987       } else {
05988         MFI->setObjectAlignment(FI, RequiredAlign);
05989       }
05990     }
05991 
05992     // (Offset % 16 or 32) must be a multiple of 4. The address is then
05993     // Ptr + (Offset & ~15).
05994     if (Offset < 0)
05995       return SDValue();
05996     if ((Offset % RequiredAlign) & 3)
05997       return SDValue();
05998     int64_t StartOffset = Offset & ~(RequiredAlign-1);
05999     if (StartOffset)
06000       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
06001                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
06002 
06003     int EltNo = (Offset - StartOffset) >> 2;
06004     unsigned NumElems = VT.getVectorNumElements();
06005 
06006     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
06007     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
06008                              LD->getPointerInfo().getWithOffset(StartOffset),
06009                              false, false, false, 0);
06010 
06011     SmallVector<int, 8> Mask;
06012     for (unsigned i = 0; i != NumElems; ++i)
06013       Mask.push_back(EltNo);
06014 
06015     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
06016   }
06017 
06018   return SDValue();
06019 }
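// A minimal sketch of the offset-absorption math above (hypothetical helper,
// assuming a 32-bit scalar load). With RequiredAlign = 16 and Offset = 20,
// the widened load starts at Ptr + 16 and the splatted element index becomes
// (20 - 16) >> 2 == 1.
static int absorbedSplatIndexExample(int64_t Offset, unsigned RequiredAlign) {
  assert(Offset >= 0 && (Offset % RequiredAlign) % 4 == 0 &&
         "offset must be non-negative and 4-byte aligned within the chunk");
  int64_t StartOffset = Offset & ~(int64_t)(RequiredAlign - 1);
  return (Offset - StartOffset) >> 2; // index of the 32-bit element to splat
}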
06020 
06021 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
06022 /// vector of type 'VT', see if the elements can be replaced by a single large
06023 /// load which has the same value as a build_vector whose operands are 'Elts'.
06024 ///
06025 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
06026 ///
06027 /// FIXME: we'd also like to handle the case where the last elements are zero
06028 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
06029 /// There's even a handy isZeroNode for that purpose.
06030 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
06031                                         SDLoc &DL, SelectionDAG &DAG,
06032                                         bool isAfterLegalize) {
06033   EVT EltVT = VT.getVectorElementType();
06034   unsigned NumElems = Elts.size();
06035 
06036   LoadSDNode *LDBase = nullptr;
06037   unsigned LastLoadedElt = -1U;
06038 
06039   // For each element in the initializer, see if we've found a load or an undef.
06040   // If we don't find an initial load element, or later load elements are
06041   // non-consecutive, bail out.
06042   for (unsigned i = 0; i < NumElems; ++i) {
06043     SDValue Elt = Elts[i];
06044 
06045     if (!Elt.getNode() ||
06046         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
06047       return SDValue();
06048     if (!LDBase) {
06049       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
06050         return SDValue();
06051       LDBase = cast<LoadSDNode>(Elt.getNode());
06052       LastLoadedElt = i;
06053       continue;
06054     }
06055     if (Elt.getOpcode() == ISD::UNDEF)
06056       continue;
06057 
06058     LoadSDNode *LD = cast<LoadSDNode>(Elt);
06059     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
06060       return SDValue();
06061     LastLoadedElt = i;
06062   }
06063 
06064   // If we have found an entire vector of loads and undefs, then return a large
06065   // load of the entire vector width starting at the base pointer.  If we found
06066   // consecutive loads for the low half, generate a vzext_load node.
06067   if (LastLoadedElt == NumElems - 1) {
06068 
06069     if (isAfterLegalize &&
06070         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
06071       return SDValue();
06072 
06073     SDValue NewLd = SDValue();
06074 
06075     NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
06076                         LDBase->getPointerInfo(), LDBase->isVolatile(),
06077                         LDBase->isNonTemporal(), LDBase->isInvariant(),
06078                         LDBase->getAlignment());
06079 
06080     if (LDBase->hasAnyUseOfValue(1)) {
06081       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
06082                                      SDValue(LDBase, 1),
06083                                      SDValue(NewLd.getNode(), 1));
06084       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
06085       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
06086                              SDValue(NewLd.getNode(), 1));
06087     }
06088 
06089     return NewLd;
06090   }
06091 
06092   // TODO: The code below fires only for loading the low v2i32 / v2f32
06093   // of a v4i32 / v4f32. It's probably worth generalizing.
06094   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
06095       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
06096     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
06097     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
06098     SDValue ResNode =
06099         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
06100                                 LDBase->getPointerInfo(),
06101                                 LDBase->getAlignment(),
06102                                 false/*isVolatile*/, true/*ReadMem*/,
06103                                 false/*WriteMem*/);
06104 
06105     // Make sure the newly-created LOAD is in the same position as LDBase in
06106     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
06107     // update uses of LDBase's output chain to use the TokenFactor.
06108     if (LDBase->hasAnyUseOfValue(1)) {
06109       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
06110                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
06111       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
06112       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
06113                              SDValue(ResNode.getNode(), 1));
06114     }
06115 
06116     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
06117   }
06118   return SDValue();
06119 }
06120 
06121 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
06122 /// to generate a splat value for the following cases:
06123 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
06124 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
06125 /// a scalar load, or a constant.
06126 /// The VBROADCAST node is returned when a pattern is found,
06127 /// or SDValue() otherwise.
06128 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
06129                                     SelectionDAG &DAG) {
06130   // VBROADCAST requires AVX.
06131   // TODO: Splats could be generated for non-AVX CPUs using SSE
06132   // instructions, but there's less potential gain for only 128-bit vectors.
06133   if (!Subtarget->hasAVX())
06134     return SDValue();
06135 
06136   MVT VT = Op.getSimpleValueType();
06137   SDLoc dl(Op);
06138 
06139   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
06140          "Unsupported vector type for broadcast.");
06141 
06142   SDValue Ld;
06143   bool ConstSplatVal;
06144 
06145   switch (Op.getOpcode()) {
06146     default:
06147       // Unknown pattern found.
06148       return SDValue();
06149 
06150     case ISD::BUILD_VECTOR: {
06151       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
06152       BitVector UndefElements;
06153       SDValue Splat = BVOp->getSplatValue(&UndefElements);
06154 
06155       // We need a splat of a single value to use broadcast, and it doesn't
06156       // make any sense if the value is only in one element of the vector.
06157       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
06158         return SDValue();
06159 
06160       Ld = Splat;
06161       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
06162                        Ld.getOpcode() == ISD::ConstantFP);
06163 
06164       // Make sure that all of the users of a non-constant load are from the
06165       // BUILD_VECTOR node.
06166       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
06167         return SDValue();
06168       break;
06169     }
06170 
06171     case ISD::VECTOR_SHUFFLE: {
06172       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
06173 
06174       // Shuffles must have a splat mask where the first element is
06175       // broadcasted.
06176       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
06177         return SDValue();
06178 
06179       SDValue Sc = Op.getOperand(0);
06180       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
06181           Sc.getOpcode() != ISD::BUILD_VECTOR) {
06182 
06183         if (!Subtarget->hasInt256())
06184           return SDValue();
06185 
06186         // Use the register form of the broadcast instruction available on AVX2.
06187         if (VT.getSizeInBits() >= 256)
06188           Sc = Extract128BitVector(Sc, 0, DAG, dl);
06189         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
06190       }
06191 
06192       Ld = Sc.getOperand(0);
06193       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
06194                        Ld.getOpcode() == ISD::ConstantFP);
06195 
06196       // The scalar_to_vector node and the suspected
06197       // load node must have exactly one user.
06198       // Constants may have multiple users.
06199 
06200       // AVX-512 has a register version of the broadcast.
06201       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
06202         Ld.getValueType().getSizeInBits() >= 32;
06203       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
06204           !hasRegVer))
06205         return SDValue();
06206       break;
06207     }
06208   }
06209 
06210   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
06211   bool IsGE256 = (VT.getSizeInBits() >= 256);
06212 
06213   // When optimizing for size, generate up to 5 extra bytes for a broadcast
06214   // instruction to save 8 or more bytes of constant pool data.
06215   // TODO: If multiple splats are generated to load the same constant,
06216   // it may be detrimental to overall size. There needs to be a way to detect
06217   // that condition to know if this is truly a size win.
06218   const Function *F = DAG.getMachineFunction().getFunction();
06219   bool OptForSize = F->getAttributes().
06220     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
06221 
06222   // Handle broadcasting a single constant scalar from the constant pool
06223   // into a vector.
06224   // On Sandybridge (no AVX2), it is still better to load a constant vector
06225   // from the constant pool and not to broadcast it from a scalar.
06226   // But override that restriction when optimizing for size.
06227   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
06228   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
06229     EVT CVT = Ld.getValueType();
06230     assert(!CVT.isVector() && "Must not broadcast a vector type");
06231 
06232     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
06233     // For size optimization, also splat v2f64 and v2i64, and for size opt
06234     // with AVX2, also splat i8 and i16.
06235     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
06236     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
06237         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
06238       const Constant *C = nullptr;
06239       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
06240         C = CI->getConstantIntValue();
06241       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
06242         C = CF->getConstantFPValue();
06243 
06244       assert(C && "Invalid constant type");
06245 
06246       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
06247       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
06248       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
06249       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
06250                        MachinePointerInfo::getConstantPool(),
06251                        false, false, false, Alignment);
06252 
06253       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06254     }
06255   }
06256 
06257   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
06258 
06259   // Handle AVX2 in-register broadcasts.
06260   if (!IsLoad && Subtarget->hasInt256() &&
06261       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
06262     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06263 
06264   // The scalar source must be a normal load.
06265   if (!IsLoad)
06266     return SDValue();
06267 
06268   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
06269       (Subtarget->hasVLX() && ScalarSize == 64))
06270     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06271 
06272   // The integer check is needed for the 64-bit into 128-bit case, so that it
06273   // doesn't match double, since there is no vbroadcastsd xmm form.
06274   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
06275     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
06276       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06277   }
06278 
06279   // Unsupported broadcast.
06280   return SDValue();
06281 }
06282 
06283 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
06284 /// underlying vector and index.
06285 ///
06286 /// Modifies \p ExtractedFromVec to the real vector and returns the real
06287 /// index.
06288 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
06289                                          SDValue ExtIdx) {
06290   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
06291   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
06292     return Idx;
06293 
06294   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
06295   // lowered this:
06296   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
06297   // to:
06298   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
06299   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
06300   //                           undef)
06301   //                       Constant<0>)
06302   // In this case the vector is the extract_subvector expression and the index
06303   // is 2, as specified by the shuffle.
06304   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
06305   SDValue ShuffleVec = SVOp->getOperand(0);
06306   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
06307   assert(ShuffleVecVT.getVectorElementType() ==
06308          ExtractedFromVec.getSimpleValueType().getVectorElementType());
06309 
06310   int ShuffleIdx = SVOp->getMaskElt(Idx);
06311   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
06312     ExtractedFromVec = ShuffleVec;
06313     return ShuffleIdx;
06314   }
06315   return Idx;
06316 }
06317 
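/// buildFromShuffleMostly - Try to lower a BUILD_VECTOR whose operands are
/// mostly EXTRACT_VECTOR_ELTs taken from at most two source vectors of the
/// same type: emit one vector shuffle of those sources, then re-insert the
/// few remaining non-extracted operands with INSERT_VECTOR_ELT.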
06318 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
06319   MVT VT = Op.getSimpleValueType();
06320 
06321   // Skip if insert_vec_elt is not supported.
06322   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
06323   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
06324     return SDValue();
06325 
06326   SDLoc DL(Op);
06327   unsigned NumElems = Op.getNumOperands();
06328 
06329   SDValue VecIn1;
06330   SDValue VecIn2;
06331   SmallVector<unsigned, 4> InsertIndices;
06332   SmallVector<int, 8> Mask(NumElems, -1);
06333 
06334   for (unsigned i = 0; i != NumElems; ++i) {
06335     unsigned Opc = Op.getOperand(i).getOpcode();
06336 
06337     if (Opc == ISD::UNDEF)
06338       continue;
06339 
06340     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
06341       // Quit if more than 1 element needs inserting.
06342       if (InsertIndices.size() > 1)
06343         return SDValue();
06344 
06345       InsertIndices.push_back(i);
06346       continue;
06347     }
06348 
06349     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
06350     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
06351     // Quit if non-constant index.
06352     if (!isa<ConstantSDNode>(ExtIdx))
06353       return SDValue();
06354     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
06355 
06356     // Quit if extracted from vector of different type.
06357     if (ExtractedFromVec.getValueType() != VT)
06358       return SDValue();
06359 
06360     if (!VecIn1.getNode())
06361       VecIn1 = ExtractedFromVec;
06362     else if (VecIn1 != ExtractedFromVec) {
06363       if (!VecIn2.getNode())
06364         VecIn2 = ExtractedFromVec;
06365       else if (VecIn2 != ExtractedFromVec)
06366         // Quit if more than 2 vectors need shuffling.
06367         return SDValue();
06368     }
06369 
06370     if (ExtractedFromVec == VecIn1)
06371       Mask[i] = Idx;
06372     else if (ExtractedFromVec == VecIn2)
06373       Mask[i] = Idx + NumElems;
06374   }
06375 
06376   if (!VecIn1.getNode())
06377     return SDValue();
06378 
06379   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
06380   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
06381   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
06382     unsigned Idx = InsertIndices[i];
06383     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
06384                      DAG.getIntPtrConstant(Idx));
06385   }
06386 
06387   return NV;
06388 }
06389 
06390 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
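// As an illustrative sketch, an all-constant mask such as
//   (v8i1 build_vector 1, 0, 1, 1, 0, 0, 0, 0)
// is handled by the all-constant path below: the bits are packed into the
// immediate 0b00001101 (held in an i16), bitcast to v16i1, and the low v8i1
// subvector is extracted, instead of inserting the elements one at a time.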
06391 SDValue
06392 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
06393 
06394   MVT VT = Op.getSimpleValueType();
06395   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
06396          "Unexpected type in LowerBUILD_VECTORvXi1!");
06397 
06398   SDLoc dl(Op);
06399   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
06400     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
06401     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
06402     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
06403   }
06404 
06405   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
06406     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
06407     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
06408     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
06409   }
06410 
06411   bool AllConstants = true;
06412   uint64_t Immediate = 0;
06413   int NonConstIdx = -1;
06414   bool IsSplat = true;
06415   unsigned NumNonConsts = 0;
06416   unsigned NumConsts = 0;
06417   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
06418     SDValue In = Op.getOperand(idx);
06419     if (In.getOpcode() == ISD::UNDEF)
06420       continue;
06421     if (!isa<ConstantSDNode>(In)) {
06422       AllConstants = false;
06423       NonConstIdx = idx;
06424       NumNonConsts++;
06425     } else {
06426       NumConsts++;
06427       if (cast<ConstantSDNode>(In)->getZExtValue())
06428         Immediate |= (1ULL << idx);
06429     }
06430     if (In != Op.getOperand(0))
06431       IsSplat = false;
06432   }
06433 
06434   if (AllConstants) {
06435     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
06436       DAG.getConstant(Immediate, MVT::i16));
06437     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
06438                        DAG.getIntPtrConstant(0));
06439   }
06440 
06441   if (NumNonConsts == 1 && NonConstIdx != 0) {
06442     SDValue DstVec;
06443     if (NumConsts) {
06444       SDValue VecAsImm = DAG.getConstant(Immediate,
06445                                          MVT::getIntegerVT(VT.getSizeInBits()));
06446       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
06447     }
06448     else
06449       DstVec = DAG.getUNDEF(VT);
06450     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
06451                        Op.getOperand(NonConstIdx),
06452                        DAG.getIntPtrConstant(NonConstIdx));
06453   }
06454   if (!IsSplat && (NonConstIdx != 0))
06455     llvm_unreachable("Unsupported BUILD_VECTOR operation");
06456   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
06457   SDValue Select;
06458   if (IsSplat)
06459     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
06460                           DAG.getConstant(-1, SelectVT),
06461                           DAG.getConstant(0, SelectVT));
06462   else
06463     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
06464                          DAG.getConstant((Immediate | 1), SelectVT),
06465                          DAG.getConstant(Immediate, SelectVT));
06466   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
06467 }
06468 
06469 /// \brief Return true if \p N implements a horizontal binop and return the
06470 /// operands for the horizontal binop into V0 and V1.
06471 ///
06472 /// This is a helper function of PerformBUILD_VECTORCombine.
06473 /// This function checks that the build_vector \p N in input implements a
06474 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
06475 /// operation to match.
06476 /// For example, if \p Opcode is equal to ISD::ADD, then this function
06477 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
06478 /// is equal to ISD::SUB, then this function checks if this is a horizontal
06479 /// arithmetic sub.
06480 ///
06481 /// This function only analyzes elements of \p N whose indices are
06482 /// in range [BaseIdx, LastIdx).
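/// Illustrative example (v4f32, \p Opcode == ISD::FADD, BaseIdx = 0,
/// LastIdx = 4):
///   build_vector (fadd (extractelt A, 0), (extractelt A, 1)),
///                (fadd (extractelt A, 2), (extractelt A, 3)),
///                (fadd (extractelt B, 0), (extractelt B, 1)),
///                (fadd (extractelt B, 2), (extractelt B, 3))
/// matches, with \p V0 = A and \p V1 = B.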
06483 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
06484                               SelectionDAG &DAG,
06485                               unsigned BaseIdx, unsigned LastIdx,
06486                               SDValue &V0, SDValue &V1) {
06487   EVT VT = N->getValueType(0);
06488 
06489   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
06490   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
06491          "Invalid Vector in input!");
06492 
06493   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
06494   bool CanFold = true;
06495   unsigned ExpectedVExtractIdx = BaseIdx;
06496   unsigned NumElts = LastIdx - BaseIdx;
06497   V0 = DAG.getUNDEF(VT);
06498   V1 = DAG.getUNDEF(VT);
06499 
06500   // Check if N implements a horizontal binop.
06501   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
06502     SDValue Op = N->getOperand(i + BaseIdx);
06503 
06504     // Skip UNDEFs.
06505     if (Op->getOpcode() == ISD::UNDEF) {
06506       // Update the expected vector extract index.
06507       if (i * 2 == NumElts)
06508         ExpectedVExtractIdx = BaseIdx;
06509       ExpectedVExtractIdx += 2;
06510       continue;
06511     }
06512 
06513     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
06514 
06515     if (!CanFold)
06516       break;
06517 
06518     SDValue Op0 = Op.getOperand(0);
06519     SDValue Op1 = Op.getOperand(1);
06520 
06521     // Try to match the following pattern:
06522     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
06523     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
06524         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
06525         Op0.getOperand(0) == Op1.getOperand(0) &&
06526         isa<ConstantSDNode>(Op0.getOperand(1)) &&
06527         isa<ConstantSDNode>(Op1.getOperand(1)));
06528     if (!CanFold)
06529       break;
06530 
06531     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
06532     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
06533 
06534     if (i * 2 < NumElts) {
06535       if (V0.getOpcode() == ISD::UNDEF)
06536         V0 = Op0.getOperand(0);
06537     } else {
06538       if (V1.getOpcode() == ISD::UNDEF)
06539         V1 = Op0.getOperand(0);
06540       if (i * 2 == NumElts)
06541         ExpectedVExtractIdx = BaseIdx;
06542     }
06543 
06544     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
06545     if (I0 == ExpectedVExtractIdx)
06546       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
06547     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
06548       // Try to match the following dag sequence:
06549       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
06550       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
06551     } else
06552       CanFold = false;
06553 
06554     ExpectedVExtractIdx += 2;
06555   }
06556 
06557   return CanFold;
06558 }
06559 
06560 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
06561 /// a concat_vector.
06562 ///
06563 /// This is a helper function of PerformBUILD_VECTORCombine.
06564 /// This function expects two 256-bit vectors called V0 and V1.
06565 /// At first, each vector is split into two separate 128-bit vectors.
06566 /// Then, the resulting 128-bit vectors are used to implement two
06567 /// horizontal binary operations.
06568 ///
06569 /// The kind of horizontal binary operation is defined by \p X86Opcode.
06570 ///
06571 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
06572 /// the two new horizontal binops.
06573 /// When Mode is set, the first horizontal binop dag node would take as input
06574 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
06575 /// horizontal binop dag node would take as input the lower 128-bit of V1
06576 /// and the upper 128-bit of V1.
06577 ///   Example:
06578 ///     HADD V0_LO, V0_HI
06579 ///     HADD V1_LO, V1_HI
06580 ///
06581 /// Otherwise, the first horizontal binop dag node takes as input the lower
06582 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
06583 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
06584 ///   Example:
06585 ///     HADD V0_LO, V1_LO
06586 ///     HADD V0_HI, V1_HI
06587 ///
06588 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
06589 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
06590 /// the upper 128-bits of the result.
06591 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
06592                                      SDLoc DL, SelectionDAG &DAG,
06593                                      unsigned X86Opcode, bool Mode,
06594                                      bool isUndefLO, bool isUndefHI) {
06595   EVT VT = V0.getValueType();
06596   assert(VT.is256BitVector() && VT == V1.getValueType() &&
06597          "Invalid nodes in input!");
06598 
06599   unsigned NumElts = VT.getVectorNumElements();
06600   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
06601   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
06602   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
06603   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
06604   EVT NewVT = V0_LO.getValueType();
06605 
06606   SDValue LO = DAG.getUNDEF(NewVT);
06607   SDValue HI = DAG.getUNDEF(NewVT);
06608 
06609   if (Mode) {
06610     // Don't emit a horizontal binop if the result is expected to be UNDEF.
06611     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
06612       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
06613     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
06614       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
06615   } else {
06616     // Don't emit a horizontal binop if the result is expected to be UNDEF.
06617     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
06618                        V1_LO->getOpcode() != ISD::UNDEF))
06619       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
06620 
06621     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
06622                        V1_HI->getOpcode() != ISD::UNDEF))
06623       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
06624   }
06625 
06626   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
06627 }
06628 
06629 /// \brief Try to fold a build_vector that performs an 'addsub' into the
06630 /// sequence of 'vadd + vsub + blendi'.
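/// Illustrative example (v4f32):
///   build_vector (fsub (extractelt A, 0), (extractelt B, 0)),
///                (fadd (extractelt A, 1), (extractelt B, 1)),
///                (fsub (extractelt A, 2), (extractelt B, 2)),
///                (fadd (extractelt A, 3), (extractelt B, 3))
/// is folded into (X86ISD::ADDSUB A, B), which selects to addsubps.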
06631 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
06632                            const X86Subtarget *Subtarget) {
06633   SDLoc DL(BV);
06634   EVT VT = BV->getValueType(0);
06635   unsigned NumElts = VT.getVectorNumElements();
06636   SDValue InVec0 = DAG.getUNDEF(VT);
06637   SDValue InVec1 = DAG.getUNDEF(VT);
06638 
06639   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
06640           VT == MVT::v2f64) && "build_vector with an invalid type found!");
06641 
06642   // Odd-numbered elements in the input build vector are obtained from
06643   // adding two integer/float elements.
06644   // Even-numbered elements in the input build vector are obtained from
06645   // subtracting two integer/float elements.
06646   unsigned ExpectedOpcode = ISD::FSUB;
06647   unsigned NextExpectedOpcode = ISD::FADD;
06648   bool AddFound = false;
06649   bool SubFound = false;
06650 
06651   for (unsigned i = 0, e = NumElts; i != e; i++) {
06652     SDValue Op = BV->getOperand(i);
06653 
06654     // Skip 'undef' values.
06655     unsigned Opcode = Op.getOpcode();
06656     if (Opcode == ISD::UNDEF) {
06657       std::swap(ExpectedOpcode, NextExpectedOpcode);
06658       continue;
06659     }
06660 
06661     // Early exit if we found an unexpected opcode.
06662     if (Opcode != ExpectedOpcode)
06663       return SDValue();
06664 
06665     SDValue Op0 = Op.getOperand(0);
06666     SDValue Op1 = Op.getOperand(1);
06667 
06668     // Try to match the following pattern:
06669     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
06670     // Early exit if we cannot match that sequence.
06671     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
06672         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
06673         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
06674         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
06675         Op0.getOperand(1) != Op1.getOperand(1))
06676       return SDValue();
06677 
06678     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
06679     if (I0 != i)
06680       return SDValue();
06681 
06682     // We found a valid add/sub node. Update the information accordingly.
06683     if (i & 1)
06684       AddFound = true;
06685     else
06686       SubFound = true;
06687 
06688     // Update InVec0 and InVec1.
06689     if (InVec0.getOpcode() == ISD::UNDEF)
06690       InVec0 = Op0.getOperand(0);
06691     if (InVec1.getOpcode() == ISD::UNDEF)
06692       InVec1 = Op1.getOperand(0);
06693 
06694     // Make sure that the operands of each add/sub node always
06695     // come from the same pair of vectors.
06696     if (InVec0 != Op0.getOperand(0)) {
06697       if (ExpectedOpcode == ISD::FSUB)
06698         return SDValue();
06699 
06700       // FADD is commutable. Try to commute the operands
06701       // and then test again.
06702       std::swap(Op0, Op1);
06703       if (InVec0 != Op0.getOperand(0))
06704         return SDValue();
06705     }
06706 
06707     if (InVec1 != Op1.getOperand(0))
06708       return SDValue();
06709 
06710     // Update the pair of expected opcodes.
06711     std::swap(ExpectedOpcode, NextExpectedOpcode);
06712   }
06713 
06714   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
06715   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
06716       InVec1.getOpcode() != ISD::UNDEF)
06717     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
06718 
06719   return SDValue();
06720 }
06721 
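// Combine a BUILD_VECTOR into a single X86-specific node where profitable:
// first try the ADDSUB pattern above, then the various horizontal add/sub
// forms (SSE3/SSSE3 for 128-bit, AVX/AVX2 for 256-bit vectors). Returns
// SDValue() if no pattern matches.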
06722 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
06723                                           const X86Subtarget *Subtarget) {
06724   SDLoc DL(N);
06725   EVT VT = N->getValueType(0);
06726   unsigned NumElts = VT.getVectorNumElements();
06727   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
06728   SDValue InVec0, InVec1;
06729 
06730   // Try to match an ADDSUB.
06731   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
06732       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
06733     SDValue Value = matchAddSub(BV, DAG, Subtarget);
06734     if (Value.getNode())
06735       return Value;
06736   }
06737 
06738   // Try to match horizontal ADD/SUB.
06739   unsigned NumUndefsLO = 0;
06740   unsigned NumUndefsHI = 0;
06741   unsigned Half = NumElts/2;
06742 
06743   // Count the number of UNDEF operands in the build_vector in input.
06744   for (unsigned i = 0, e = Half; i != e; ++i)
06745     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
06746       NumUndefsLO++;
06747 
06748   for (unsigned i = Half, e = NumElts; i != e; ++i)
06749     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
06750       NumUndefsHI++;
06751 
06752   // Early exit if this is either a build_vector of all UNDEFs, or if all
06753   // the operands but one are UNDEF.
06754   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
06755     return SDValue();
06756 
06757   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
06758     // Try to match an SSE3 float HADD/HSUB.
06759     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
06760       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
06761 
06762     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
06763       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
06764   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
06765     // Try to match an SSSE3 integer HADD/HSUB.
06766     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
06767       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
06768 
06769     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
06770       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
06771   }
06772 
06773   if (!Subtarget->hasAVX())
06774     return SDValue();
06775 
06776   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
06777     // Try to match an AVX horizontal add/sub of packed single/double
06778     // precision floating point values from 256-bit vectors.
06779     SDValue InVec2, InVec3;
06780     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
06781         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
06782         ((InVec0.getOpcode() == ISD::UNDEF ||
06783           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06784         ((InVec1.getOpcode() == ISD::UNDEF ||
06785           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06786       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
06787 
06788     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
06789         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
06790         ((InVec0.getOpcode() == ISD::UNDEF ||
06791           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06792         ((InVec1.getOpcode() == ISD::UNDEF ||
06793           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06794       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
06795   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
06796     // Try to match an AVX2 horizontal add/sub of signed integers.
06797     SDValue InVec2, InVec3;
06798     unsigned X86Opcode;
06799     bool CanFold = true;
06800 
06801     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
06802         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
06803         ((InVec0.getOpcode() == ISD::UNDEF ||
06804           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06805         ((InVec1.getOpcode() == ISD::UNDEF ||
06806           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06807       X86Opcode = X86ISD::HADD;
06808     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
06809         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
06810         ((InVec0.getOpcode() == ISD::UNDEF ||
06811           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06812         ((InVec1.getOpcode() == ISD::UNDEF ||
06813           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06814       X86Opcode = X86ISD::HSUB;
06815     else
06816       CanFold = false;
06817 
06818     if (CanFold) {
06819       // Fold this build_vector into a single horizontal add/sub.
06820       // Do this only if the target has AVX2.
06821       if (Subtarget->hasAVX2())
06822         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
06823 
06824       // Do not try to expand this build_vector into a pair of horizontal
06825       // add/sub if we can emit a pair of scalar add/sub.
06826       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
06827         return SDValue();
06828 
06829       // Convert this build_vector into a pair of horizontal binops followed by
06830       // a concat vector.
06831       bool isUndefLO = NumUndefsLO == Half;
06832       bool isUndefHI = NumUndefsHI == Half;
06833       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
06834                                    isUndefLO, isUndefHI);
06835     }
06836   }
06837 
06838   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
06839        VT == MVT::v16i16) && Subtarget->hasAVX()) {
06840     unsigned X86Opcode;
06841     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
06842       X86Opcode = X86ISD::HADD;
06843     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
06844       X86Opcode = X86ISD::HSUB;
06845     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
06846       X86Opcode = X86ISD::FHADD;
06847     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
06848       X86Opcode = X86ISD::FHSUB;
06849     else
06850       return SDValue();
06851 
06852     // Don't try to expand this build_vector into a pair of horizontal add/sub
06853     // if we can simply emit a pair of scalar add/sub.
06854     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
06855       return SDValue();
06856 
06857     // Convert this build_vector into two horizontal add/sub followed by
06858     // a concat vector.
06859     bool isUndefLO = NumUndefsLO == Half;
06860     bool isUndefHI = NumUndefsHI == Half;
06861     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
06862                                  isUndefLO, isUndefHI);
06863   }
06864 
06865   return SDValue();
06866 }
06867 
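// Lower a generic BUILD_VECTOR: handle i1 predicate vectors, all-zero and
// all-one constants, and broadcasts first, then fall back to progressively
// more general insert- and shuffle-based expansions.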
06868 SDValue
06869 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
06870   SDLoc dl(Op);
06871 
06872   MVT VT = Op.getSimpleValueType();
06873   MVT ExtVT = VT.getVectorElementType();
06874   unsigned NumElems = Op.getNumOperands();
06875 
06876   // Generate vectors for predicate vectors.
06877   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
06878     return LowerBUILD_VECTORvXi1(Op, DAG);
06879 
06880   // Vectors containing all zeros can be matched by pxor and xorps later
06881   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
06882     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
06883     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
06884     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
06885       return Op;
06886 
06887     return getZeroVector(VT, Subtarget, DAG, dl);
06888   }
06889 
06890   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
06891   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
06892   // vpcmpeqd on 256-bit vectors.
06893   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
06894     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
06895       return Op;
06896 
06897     if (!VT.is512BitVector())
06898       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
06899   }
06900 
06901   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
06902   if (Broadcast.getNode())
06903     return Broadcast;
06904 
06905   unsigned EVTBits = ExtVT.getSizeInBits();
06906 
06907   unsigned NumZero  = 0;
06908   unsigned NumNonZero = 0;
06909   unsigned NonZeros = 0;
06910   bool IsAllConstants = true;
06911   SmallSet<SDValue, 8> Values;
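  // Scan the operands once: collect the distinct values, count zero and
  // non-zero elements, build a bitmask of the non-zero lanes, and note
  // whether every defined element is a constant.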
06912   for (unsigned i = 0; i < NumElems; ++i) {
06913     SDValue Elt = Op.getOperand(i);
06914     if (Elt.getOpcode() == ISD::UNDEF)
06915       continue;
06916     Values.insert(Elt);
06917     if (Elt.getOpcode() != ISD::Constant &&
06918         Elt.getOpcode() != ISD::ConstantFP)
06919       IsAllConstants = false;
06920     if (X86::isZeroNode(Elt))
06921       NumZero++;
06922     else {
06923       NonZeros |= (1 << i);
06924       NumNonZero++;
06925     }
06926   }
06927 
06928   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
06929   if (NumNonZero == 0)
06930     return DAG.getUNDEF(VT);
06931 
06932   // Special case for a single non-zero, non-undef element.
06933   if (NumNonZero == 1) {
06934     unsigned Idx = countTrailingZeros(NonZeros);
06935     SDValue Item = Op.getOperand(Idx);
06936 
06937     // If this is an insertion of an i64 value on x86-32, and if the top bits of
06938     // the value are obviously zero, truncate the value to i32 and do the
06939     // insertion that way.  Only do this if the value is non-constant or if the
06940     // value is a constant being inserted into element 0.  It is cheaper to do
06941     // a constant pool load than it is to do a movd + shuffle.
06942     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
06943         (!IsAllConstants || Idx == 0)) {
06944       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
06945         // Handle SSE only.
06946         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
06947         EVT VecVT = MVT::v4i32;
06948         unsigned VecElts = 4;
06949 
06950         // Truncate the value (which may itself be a constant) to i32, and
06951         // convert it to a vector with movd (S2V+shuffle to zero extend).
06952         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
06953         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
06954 
06955         // If using the new shuffle lowering, just directly insert this.
06956         if (ExperimentalVectorShuffleLowering)
06957           return DAG.getNode(
06958               ISD::BITCAST, dl, VT,
06959               getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
06960 
06961         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06962 
06963         // Now we have our 32-bit value zero extended in the low element of
06964         // a vector.  If Idx != 0, swizzle it into place.
06965         if (Idx != 0) {
06966           SmallVector<int, 4> Mask;
06967           Mask.push_back(Idx);
06968           for (unsigned i = 1; i != VecElts; ++i)
06969             Mask.push_back(i);
06970           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
06971                                       &Mask[0]);
06972         }
06973         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
06974       }
06975     }
06976 
06977     // If we have a constant or non-constant insertion into the low element of
06978     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
06979     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
06980     // depending on what the source datatype is.
06981     if (Idx == 0) {
06982       if (NumZero == 0)
06983         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06984 
06985       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
06986           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
06987         if (VT.is256BitVector() || VT.is512BitVector()) {
06988           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
06989           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
06990                              Item, DAG.getIntPtrConstant(0));
06991         }
06992         assert(VT.is128BitVector() && "Expected an SSE value type!");
06993         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06994         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
06995         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06996       }
06997 
06998       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
06999         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
07000         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
07001         if (VT.is256BitVector()) {
07002           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
07003           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
07004         } else {
07005           assert(VT.is128BitVector() && "Expected an SSE value type!");
07006           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
07007         }
07008         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
07009       }
07010     }
07011 
07012     // Is it a vector logical left shift?
07013     if (NumElems == 2 && Idx == 1 &&
07014         X86::isZeroNode(Op.getOperand(0)) &&
07015         !X86::isZeroNode(Op.getOperand(1))) {
07016       unsigned NumBits = VT.getSizeInBits();
07017       return getVShift(true, VT,
07018                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
07019                                    VT, Op.getOperand(1)),
07020                        NumBits/2, DAG, *this, dl);
07021     }
07022 
07023     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
07024       return SDValue();
07025 
07026     // Otherwise, if this is a vector with i32 or f32 elements, and the element
07027     // is a non-constant being inserted into an element other than the low one,
07028     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
07029     // movd/movss) to move this into the low element, then shuffle it into
07030     // place.
07031     if (EVTBits == 32) {
07032       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
07033 
07034       // If using the new shuffle lowering, just directly insert this.
07035       if (ExperimentalVectorShuffleLowering)
07036         return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
07037 
07038       // Turn it into a shuffle of zero and zero-extended scalar to vector.
07039       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
07040       SmallVector<int, 8> MaskVec;
07041       for (unsigned i = 0; i != NumElems; ++i)
07042         MaskVec.push_back(i == Idx ? 0 : 1);
07043       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
07044     }
07045   }
07046 
07047   // Splat is obviously ok. Let legalizer expand it to a shuffle.
07048   if (Values.size() == 1) {
07049     if (EVTBits == 32) {
07050       // Instead of a shuffle like this:
07051       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
07052       // Check if it's possible to issue this instead.
07053       // shuffle (vload ptr), undef, <1, 1, 1, 1>
07054       unsigned Idx = countTrailingZeros(NonZeros);
07055       SDValue Item = Op.getOperand(Idx);
07056       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
07057         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
07058     }
07059     return SDValue();
07060   }
07061 
07062   // A vector full of immediates; various special cases are already
07063   // handled, so this is best done with a single constant-pool load.
07064   if (IsAllConstants)
07065     return SDValue();
07066 
07067   // For AVX-length vectors, see if we can use a vector load to get all of the
07068   // elements, otherwise build the individual 128-bit pieces and use
07069   // shuffles to put them in place.
07070   if (VT.is256BitVector() || VT.is512BitVector()) {
07071     SmallVector<SDValue, 64> V;
07072     for (unsigned i = 0; i != NumElems; ++i)
07073       V.push_back(Op.getOperand(i));
07074 
07075     // Check for a build vector of consecutive loads.
07076     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
07077       return LD;
07078 
07079     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
07080 
07081     // Build both the lower and upper subvector.
07082     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
07083                                 makeArrayRef(&V[0], NumElems/2));
07084     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
07085                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
07086 
07087     // Recreate the wider vector with the lower and upper part.
07088     if (VT.is256BitVector())
07089       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
07090     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
07091   }
07092 
07093   // Let legalizer expand 2-wide build_vectors.
07094   if (EVTBits == 64) {
07095     if (NumNonZero == 1) {
07096       // One half is zero or undef.
07097       unsigned Idx = countTrailingZeros(NonZeros);
07098       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
07099                                  Op.getOperand(Idx));
07100       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
07101     }
07102     return SDValue();
07103   }
07104 
07105   // If element VT is < 32 bits, convert it to inserts into a zero vector.
07106   if (EVTBits == 8 && NumElems == 16) {
07107     SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG,
07108                                         Subtarget, *this);
07109     if (V.getNode()) return V;
07110   }
07111 
07112   if (EVTBits == 16 && NumElems == 8) {
07113     SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG,
07114                                       Subtarget, *this);
07115     if (V.getNode()) return V;
07116   }
07117 
07118   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
07119   if (EVTBits == 32 && NumElems == 4) {
07120     SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
07121     if (V.getNode())
07122       return V;
07123   }
07124 
07125   // If element VT is == 32 bits, turn it into a number of shuffles.
07126   SmallVector<SDValue, 8> V(NumElems);
07127   if (NumElems == 4 && NumZero > 0) {
07128     for (unsigned i = 0; i < 4; ++i) {
07129       bool isZero = !(NonZeros & (1 << i));
07130       if (isZero)
07131         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
07132       else
07133         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
07134     }
07135 
07136     for (unsigned i = 0; i < 2; ++i) {
07137       switch ((NonZe