X86ISelLowering.cpp

00001 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file defines the interfaces that X86 uses to lower LLVM code into a
00011 // selection DAG.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "X86ISelLowering.h"
00016 #include "Utils/X86ShuffleDecode.h"
00017 #include "X86CallingConv.h"
00018 #include "X86InstrBuilder.h"
00019 #include "X86MachineFunctionInfo.h"
00020 #include "X86TargetMachine.h"
00021 #include "X86TargetObjectFile.h"
00022 #include "llvm/ADT/SmallBitVector.h"
00023 #include "llvm/ADT/SmallSet.h"
00024 #include "llvm/ADT/Statistic.h"
00025 #include "llvm/ADT/StringExtras.h"
00026 #include "llvm/ADT/StringSwitch.h"
00027 #include "llvm/ADT/VariadicFunction.h"
00028 #include "llvm/CodeGen/IntrinsicLowering.h"
00029 #include "llvm/CodeGen/MachineFrameInfo.h"
00030 #include "llvm/CodeGen/MachineFunction.h"
00031 #include "llvm/CodeGen/MachineInstrBuilder.h"
00032 #include "llvm/CodeGen/MachineJumpTableInfo.h"
00033 #include "llvm/CodeGen/MachineModuleInfo.h"
00034 #include "llvm/CodeGen/MachineRegisterInfo.h"
00035 #include "llvm/IR/CallSite.h"
00036 #include "llvm/IR/CallingConv.h"
00037 #include "llvm/IR/Constants.h"
00038 #include "llvm/IR/DerivedTypes.h"
00039 #include "llvm/IR/Function.h"
00040 #include "llvm/IR/GlobalAlias.h"
00041 #include "llvm/IR/GlobalVariable.h"
00042 #include "llvm/IR/Instructions.h"
00043 #include "llvm/IR/Intrinsics.h"
00044 #include "llvm/MC/MCAsmInfo.h"
00045 #include "llvm/MC/MCContext.h"
00046 #include "llvm/MC/MCExpr.h"
00047 #include "llvm/MC/MCSymbol.h"
00048 #include "llvm/Support/CommandLine.h"
00049 #include "llvm/Support/Debug.h"
00050 #include "llvm/Support/ErrorHandling.h"
00051 #include "llvm/Support/MathExtras.h"
00052 #include "llvm/Target/TargetOptions.h"
00053 #include "X86IntrinsicsInfo.h"
00054 #include <bitset>
00055 #include <numeric>
00056 #include <cctype>
00057 using namespace llvm;
00058 
00059 #define DEBUG_TYPE "x86-isel"
00060 
00061 STATISTIC(NumTailCalls, "Number of tail calls");
00062 
00063 static cl::opt<bool> ExperimentalVectorWideningLegalization(
00064     "x86-experimental-vector-widening-legalization", cl::init(false),
00065     cl::desc("Enable an experimental vector type legalization through widening "
00066              "rather than promotion."),
00067     cl::Hidden);
00068 
00069 static cl::opt<bool> ExperimentalVectorShuffleLowering(
00070     "x86-experimental-vector-shuffle-lowering", cl::init(true),
00071     cl::desc("Enable an experimental vector shuffle lowering code path."),
00072     cl::Hidden);
00073 
00074 static cl::opt<int> ReciprocalEstimateRefinementSteps(
00075     "x86-recip-refinement-steps", cl::init(1),
00076     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
00077              "result of the hardware reciprocal estimate instruction."),
00078     cl::NotHidden);
00079 
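// The refinement steps controlled by the option above are Newton-Raphson
// iterations: given a hardware estimate E ~= 1/A (e.g. from RCPSS), one step
// computes E' = E * (2 - A * E), roughly doubling the number of correct bits.
// A standalone scalar sketch of that recurrence; the helper name is
// hypothetical and is not used by the lowering itself.
static inline float refineReciprocalEstimate(float A, float Estimate,
                                             int RefinementSteps) {
  for (int i = 0; i < RefinementSteps; ++i)
    Estimate = Estimate * (2.0f - A * Estimate); // one Newton-Raphson step
  return Estimate;
}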
00080 // Forward declarations.
00081 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
00082                        SDValue V2);
00083 
00084 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
00085                                 SelectionDAG &DAG, SDLoc dl,
00086                                 unsigned vectorWidth) {
00087   assert((vectorWidth == 128 || vectorWidth == 256) &&
00088          "Unsupported vector width");
00089   EVT VT = Vec.getValueType();
00090   EVT ElVT = VT.getVectorElementType();
00091   unsigned Factor = VT.getSizeInBits()/vectorWidth;
00092   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
00093                                   VT.getVectorNumElements()/Factor);
00094 
00095   // Extract from UNDEF is UNDEF.
00096   if (Vec.getOpcode() == ISD::UNDEF)
00097     return DAG.getUNDEF(ResultVT);
00098 
00099   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
00100   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
00101 
00102   // This is the index of the first element of the vectorWidth-bit chunk
00103   // we want.
00104   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
00105                                * ElemsPerChunk);
00106 
00107   // If the input is a buildvector just emit a smaller one.
00108   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
00109     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
00110                        makeArrayRef(Vec->op_begin() + NormalizedIdxVal,
00111                                     ElemsPerChunk));
00112 
00113   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00114   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
00115 }
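// NormalizedIdxVal above simply rounds the element index down to a chunk
// boundary: for a 256-bit v8i32 source and vectorWidth == 128, ElemsPerChunk
// is 4, so element indices 0-3 normalize to 0 (low half) and 4-7 normalize to
// 4 (high half). A standalone sketch of the same arithmetic; the helper name
// is hypothetical and is not used by the code above.
static inline unsigned normalizeSubVectorIndex(unsigned IdxVal, unsigned EltBits,
                                               unsigned VectorWidthBits) {
  unsigned ElemsPerChunk = VectorWidthBits / EltBits;            // e.g. 128/32 == 4
  return ((IdxVal * EltBits) / VectorWidthBits) * ElemsPerChunk; // round down
}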
00116 
00117 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
00118 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
00119 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
00120 /// instructions or a simple subregister reference. Idx is an index in the
00121 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
00122 /// lowering EXTRACT_VECTOR_ELT operations easier.
00123 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
00124                                    SelectionDAG &DAG, SDLoc dl) {
00125   assert((Vec.getValueType().is256BitVector() ||
00126           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
00127   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
00128 }
00129 
00130 /// Generate a DAG to grab 256-bits from a 512-bit vector.
00131 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
00132                                    SelectionDAG &DAG, SDLoc dl) {
00133   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
00134   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
00135 }
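// One typical use of the two helpers above is splitting a 256-bit operand into
// its 128-bit halves before handling each half separately. A sketch under that
// assumption; the helper name is hypothetical and not part of this file.
static void splitVectorIntoHalves(SDValue Wide, SelectionDAG &DAG, SDLoc dl,
                                  SDValue &Lo, SDValue &Hi) {
  unsigned NumElems = Wide.getValueType().getVectorNumElements();
  Lo = Extract128BitVector(Wide, 0, DAG, dl);            // elements [0, N/2)
  Hi = Extract128BitVector(Wide, NumElems / 2, DAG, dl); // elements [N/2, N)
}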
00136 
00137 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
00138                                unsigned IdxVal, SelectionDAG &DAG,
00139                                SDLoc dl, unsigned vectorWidth) {
00140   assert((vectorWidth == 128 || vectorWidth == 256) &&
00141          "Unsupported vector width");
00142   // Inserting an UNDEF vector is a no-op; just return Result.
00143   if (Vec.getOpcode() == ISD::UNDEF)
00144     return Result;
00145   EVT VT = Vec.getValueType();
00146   EVT ElVT = VT.getVectorElementType();
00147   EVT ResultVT = Result.getValueType();
00148 
00149   // Insert the relevant vectorWidth bits.
00150   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
00151 
00152   // This is the index of the first element of the vectorWidth-bit chunk
00153   // we want.
00154   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
00155                                * ElemsPerChunk);
00156 
00157   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00158   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
00159 }
00160 
00161 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
00162 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
00163 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
00164 /// simple superregister reference.  Idx is an index in the 128 bits
00165 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
00166 /// lowering INSERT_VECTOR_ELT operations easier.
00167 static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
00168                                   SelectionDAG &DAG,SDLoc dl) {
00169   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
00170   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
00171 }
00172 
00173 static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
00174                                   SelectionDAG &DAG, SDLoc dl) {
00175   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
00176   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
00177 }
00178 
00179 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
00180 /// instructions. This is used because creating CONCAT_VECTOR nodes of
00181 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
00182 /// large BUILD_VECTORS.
00183 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
00184                                    unsigned NumElems, SelectionDAG &DAG,
00185                                    SDLoc dl) {
00186   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00187   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
00188 }
00189 
00190 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
00191                                    unsigned NumElems, SelectionDAG &DAG,
00192                                    SDLoc dl) {
00193   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00194   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
00195 }
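// The inverse direction, rebuilding a 256-bit value from two 128-bit halves,
// is a single call to Concat128BitVectors. A sketch for v4i32 halves; the
// helper name is hypothetical and not part of this file.
static SDValue concatHalvesToV8i32(SDValue Lo, SDValue Hi, SelectionDAG &DAG,
                                   SDLoc dl) {
  return Concat128BitVectors(Lo, Hi, MVT::v8i32, /*NumElems=*/8, DAG, dl);
}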
00196 
00197 // FIXME: This should stop caching the target machine as soon as
00198 // we can remove resetOperationActions et al.
00199 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
00200     : TargetLowering(TM) {
00201   Subtarget = &TM.getSubtarget<X86Subtarget>();
00202   X86ScalarSSEf64 = Subtarget->hasSSE2();
00203   X86ScalarSSEf32 = Subtarget->hasSSE1();
00204   TD = getDataLayout();
00205 
00206   resetOperationActions();
00207 }
00208 
00209 void X86TargetLowering::resetOperationActions() {
00210   const TargetMachine &TM = getTargetMachine();
00211   static bool FirstTimeThrough = true;
00212 
00213   // If none of the target options have changed, then we don't need to reset the
00214   // operation actions.
00215   if (!FirstTimeThrough && TO == TM.Options) return;
00216 
00217   if (!FirstTimeThrough) {
00218     // Reinitialize the actions.
00219     initActions();
00220     FirstTimeThrough = false;
00221   }
00222 
00223   TO = TM.Options;
00224 
00225   // Set up the TargetLowering object.
00226   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
00227 
00228   // X86 is weird. It always uses i8 for shift amounts and setcc results.
00229   setBooleanContents(ZeroOrOneBooleanContent);
00230   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
00231   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
00232 
00233   // For 64-bit, since we have so many registers, use the ILP scheduler.
00234   // For 32-bit, use the register pressure specific scheduling.
00235   // For Atom, always use ILP scheduling.
00236   if (Subtarget->isAtom())
00237     setSchedulingPreference(Sched::ILP);
00238   else if (Subtarget->is64Bit())
00239     setSchedulingPreference(Sched::ILP);
00240   else
00241     setSchedulingPreference(Sched::RegPressure);
00242   const X86RegisterInfo *RegInfo =
00243       TM.getSubtarget<X86Subtarget>().getRegisterInfo();
00244   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
00245 
00246   // Bypass expensive divides on Atom when compiling with O2.
00247   if (TM.getOptLevel() >= CodeGenOpt::Default) {
00248     if (Subtarget->hasSlowDivide32())
00249       addBypassSlowDiv(32, 8);
00250     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
00251       addBypassSlowDiv(64, 16);
00252   }
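  // addBypassSlowDiv(32, 8) above asks codegen to guard each 32-bit divide
  // with a cheap run-time check, roughly equivalent to the scalar sketch
  // below, so operands that happen to fit in 8 bits take the fast narrow
  // divide. The lambda is illustrative only and is not used by the lowering.
  auto bypassedUDiv32 = [](unsigned A, unsigned B) -> unsigned {
    if (((A | B) >> 8) == 0)                      // both operands fit in 8 bits
      return (unsigned char)A / (unsigned char)B; // cheap 8-bit divide
    return A / B;                                 // full 32-bit divide
  };
  (void)bypassedUDiv32;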
00253 
00254   if (Subtarget->isTargetKnownWindowsMSVC()) {
00255     // Setup Windows compiler runtime calls.
00256     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
00257     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
00258     setLibcallName(RTLIB::SREM_I64, "_allrem");
00259     setLibcallName(RTLIB::UREM_I64, "_aullrem");
00260     setLibcallName(RTLIB::MUL_I64, "_allmul");
00261     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
00262     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
00263     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
00264     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
00265     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
00266 
00267     // The _ftol2 runtime function has an unusual calling conv, which
00268     // is modeled by a special pseudo-instruction.
00269     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
00270     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
00271     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
00272     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
00273   }
00274 
00275   if (Subtarget->isTargetDarwin()) {
00276     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
00277     setUseUnderscoreSetJmp(false);
00278     setUseUnderscoreLongJmp(false);
00279   } else if (Subtarget->isTargetWindowsGNU()) {
00280     // MS runtime is weird: it exports _setjmp, but plain longjmp (no underscore)!
00281     setUseUnderscoreSetJmp(true);
00282     setUseUnderscoreLongJmp(false);
00283   } else {
00284     setUseUnderscoreSetJmp(true);
00285     setUseUnderscoreLongJmp(true);
00286   }
00287 
00288   // Set up the register classes.
00289   addRegisterClass(MVT::i8, &X86::GR8RegClass);
00290   addRegisterClass(MVT::i16, &X86::GR16RegClass);
00291   addRegisterClass(MVT::i32, &X86::GR32RegClass);
00292   if (Subtarget->is64Bit())
00293     addRegisterClass(MVT::i64, &X86::GR64RegClass);
00294 
00295   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
00296 
00297   // We don't accept any truncstore of integer registers.
00298   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
00299   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
00300   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
00301   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
00302   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
00303   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
00304 
00305   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
00306 
00307   // SETOEQ and SETUNE require checking two conditions.
00308   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
00309   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
00310   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
00311   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
00312   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
00313   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
00314 
00315   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
00316   // operation.
00317   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
00318   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
00319   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
00320 
00321   if (Subtarget->is64Bit()) {
00322     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
00323     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00324   } else if (!TM.Options.UseSoftFloat) {
00325     // We have an algorithm for SSE2->double, and we turn this into a
00326     // 64-bit FILD followed by conditional FADD for other targets.
00327     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00328     // We have an algorithm for SSE2, and we turn this into a 64-bit
00329     // FILD for other targets.
00330     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
00331   }
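  // The "64-bit FILD followed by conditional FADD" scheme mentioned above
  // corresponds to this scalar computation: convert the bits as a *signed*
  // 64-bit value (which is what FILD does), then add 2^64 back whenever the
  // sign bit was set. The lambda is illustrative only, not used here.
  auto uint64ToDoubleViaFILD = [](unsigned long long X) {
    double D = (double)(long long)X; // signed conversion, as FILD performs
    if ((long long)X < 0)            // the top bit of X was set
      D += 18446744073709551616.0;   // conditionally add 2^64
    return D;
  };
  (void)uint64ToDoubleViaFILD;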
00332 
00333   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
00334   // this operation.
00335   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
00336   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
00337 
00338   if (!TM.Options.UseSoftFloat) {
00339     // SSE has no i16 to fp conversion, only i32
00340     if (X86ScalarSSEf32) {
00341       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00342       // f32 and f64 cases are Legal, f80 case is not
00343       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00344     } else {
00345       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
00346       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00347     }
00348   } else {
00349     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00350     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
00351   }
00352 
00353   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
00354   // are Legal, f80 is custom lowered.
00355   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
00356   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
00357 
00358   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
00359   // this operation.
00360   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
00361   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
00362 
00363   if (X86ScalarSSEf32) {
00364     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
00365     // f32 and f64 cases are Legal, f80 case is not
00366     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00367   } else {
00368     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
00369     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00370   }
00371 
00372   // Handle FP_TO_UINT by promoting the destination to a larger signed
00373   // conversion.
00374   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
00375   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
00376   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
00377 
00378   if (Subtarget->is64Bit()) {
00379     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
00380     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
00381   } else if (!TM.Options.UseSoftFloat) {
00382     // Since AVX is a superset of SSE3, only check for SSE here.
00383     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
00384       // Expand FP_TO_UINT into a select.
00385       // FIXME: We would like to use a Custom expander here eventually to do
00386       // the optimal thing for SSE vs. the default expansion in the legalizer.
00387       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
00388     else
00389       // With SSE3 we can use fisttpll to convert to a signed i64; without
00390       // SSE, we're stuck with a fistpll.
00391       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
00392   }
00393 
00394   if (isTargetFTOL()) {
00395     // Use the _ftol2 runtime function, which has a pseudo-instruction
00396     // to handle its weird calling convention.
00397     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
00398   }
00399 
00400   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
00401   if (!X86ScalarSSEf64) {
00402     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
00403     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
00404     if (Subtarget->is64Bit()) {
00405       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
00406       // Without SSE, i64->f64 goes through memory.
00407       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
00408     }
00409   }
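  // "Goes through memory" for the expanded BITCAST means a store of the
  // integer followed by a reload with the other interpretation, i.e. the
  // moral equivalent of this scalar sketch (the compiler builtin memcpy is
  // used only to keep the illustration self-contained):
  auto bitcastI64ToF64 = [](unsigned long long Bits) {
    double D;
    __builtin_memcpy(&D, &Bits, sizeof(D)); // spill as i64, reload as f64
    return D;
  };
  (void)bitcastI64ToF64;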
00410 
00411   // Scalar integer divide and remainder are lowered to use operations that
00412   // produce two results, to match the available instructions. This exposes
00413   // the two-result form to trivial CSE, which is able to combine x/y and x%y
00414   // into a single instruction.
00415   //
00416   // Scalar integer multiply-high is also lowered to use two-result
00417   // operations, to match the available instructions. However, plain multiply
00418   // (low) operations are left as Legal, as there are single-result
00419   // instructions for this in x86. Using the two-result multiply instructions
00420   // when both high and low results are needed must be arranged by dagcombine.
00421   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00422     MVT VT = IntVTs[i];
00423     setOperationAction(ISD::MULHS, VT, Expand);
00424     setOperationAction(ISD::MULHU, VT, Expand);
00425     setOperationAction(ISD::SDIV, VT, Expand);
00426     setOperationAction(ISD::UDIV, VT, Expand);
00427     setOperationAction(ISD::SREM, VT, Expand);
00428     setOperationAction(ISD::UREM, VT, Expand);
00429 
00430     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
00431     setOperationAction(ISD::ADDC, VT, Custom);
00432     setOperationAction(ISD::ADDE, VT, Custom);
00433     setOperationAction(ISD::SUBC, VT, Custom);
00434     setOperationAction(ISD::SUBE, VT, Custom);
00435   }
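  // The payoff of the two-result form described above: a source pair like the
  // one below becomes a single ISD::SDIVREM node after CSE, which then matches
  // the one x86 IDIV instruction that already produces both quotient and
  // remainder. The lambda is illustrative only, not used by the lowering.
  auto divAndRem = [](int A, int B, int &Quot, int &Rem) {
    Quot = A / B; // both results come from...
    Rem = A % B;  // ...a single IDIV after lowering plus CSE
  };
  (void)divAndRem;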
00436 
00437   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
00438   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
00439   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
00440   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
00441   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
00442   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
00443   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
00444   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
00445   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
00446   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
00447   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
00448   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
00449   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
00450   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
00451   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
00452   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
00453   if (Subtarget->is64Bit())
00454     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
00455   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
00456   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
00457   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
00458   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
00459   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
00460   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
00461   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
00462   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
00463 
00464   // Promote the i8 variants and force them on up to i32 which has a shorter
00465   // encoding.
00466   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
00467   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
00468   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
00469   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
00470   if (Subtarget->hasBMI()) {
00471     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
00472     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
00473     if (Subtarget->is64Bit())
00474       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
00475   } else {
00476     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
00477     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
00478     if (Subtarget->is64Bit())
00479       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
00480   }
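  // The distinction configured above is the zero-input case: ISD::CTTZ must
  // return the bit width for a zero input (32 for i32), which plain BSF does
  // not provide, hence the Custom lowering without BMI; with BMI, TZCNT
  // already has that semantic. A reference version of the required behaviour
  // (illustrative only, not used by the lowering):
  auto cttz32 = [](unsigned X) -> unsigned {
    if (X == 0)
      return 32;            // the extra check the Custom path has to insert
    unsigned N = 0;
    while ((X & 1u) == 0) { // count trailing zero bits
      X >>= 1;
      ++N;
    }
    return N;
  };
  (void)cttz32;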
00481 
00482   if (Subtarget->hasLZCNT()) {
00483     // When promoting the i8 variants, force them to i32 for a shorter
00484     // encoding.
00485     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
00486     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
00487     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
00488     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
00489     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
00490     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
00491     if (Subtarget->is64Bit())
00492       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
00493   } else {
00494     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
00495     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
00496     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
00497     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
00498     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
00499     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
00500     if (Subtarget->is64Bit()) {
00501       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
00502       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
00503     }
00504   }
00505 
00506   // Special handling for half-precision floating point conversions.
00507   // If we don't have F16C support, then lower half float conversions
00508   // into library calls.
00509   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
00510     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
00511     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
00512   }
00513 
00514   // There's never any support for operations beyond MVT::f32.
00515   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
00516   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
00517   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
00518   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
00519 
00520   setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
00521   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
00522   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
00523   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
00524 
00525   if (Subtarget->hasPOPCNT()) {
00526     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
00527   } else {
00528     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
00529     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
00530     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
00531     if (Subtarget->is64Bit())
00532       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
00533   }
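  // Without POPCNT, the Expand action falls back to a generic bit-twiddling
  // sequence along the lines of this classic parallel popcount (illustrative
  // only, not used by the lowering):
  auto popcount32 = [](unsigned X) -> unsigned {
    X = X - ((X >> 1) & 0x55555555u);                 // 2-bit partial sums
    X = (X & 0x33333333u) + ((X >> 2) & 0x33333333u); // 4-bit partial sums
    X = (X + (X >> 4)) & 0x0F0F0F0Fu;                 // 8-bit partial sums
    return (X * 0x01010101u) >> 24;                   // fold the byte sums
  };
  (void)popcount32;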
00534 
00535   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
00536 
00537   if (!Subtarget->hasMOVBE())
00538     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
00539 
00540   // These should be promoted to a larger select which is supported.
00541   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
00542   // X86 wants to expand cmov itself.
00543   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
00544   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
00545   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
00546   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
00547   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
00548   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
00549   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
00550   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
00551   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
00552   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
00553   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
00554   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
00555   if (Subtarget->is64Bit()) {
00556     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
00557     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
00558   }
00559   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
00560   // NOTE: the EH_SJLJ_SETJMP/_LONGJMP support here is NOT intended for SjLj
00561   // exception handling; it is a light-weight setjmp/longjmp replacement used for
00562   // continuations, user-level threading, etc. As a result, no other SjLj
00563   // exception interfaces are implemented, so please don't build your own
00564   // exception handling on top of them.
00565   // LLVM/Clang supports zero-cost DWARF exception handling.
00566   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
00567   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
00568 
00569   // Darwin ABI issue.
00570   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
00571   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
00572   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
00573   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
00574   if (Subtarget->is64Bit())
00575     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
00576   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
00577   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
00578   if (Subtarget->is64Bit()) {
00579     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
00580     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
00581     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
00582     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
00583     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
00584   }
00585   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
00586   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
00587   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
00588   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
00589   if (Subtarget->is64Bit()) {
00590     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
00591     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
00592     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
00593   }
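  // SHL_PARTS models a double-width left shift assembled from two 32-bit
  // halves, roughly the scalar computation below for amounts up to 63 (the
  // real lowering also handles amounts that are not compile-time constants).
  // The lambda is illustrative only, not used by the lowering.
  auto shl64FromParts = [](unsigned Lo, unsigned Hi, unsigned Amt,
                           unsigned &OutLo, unsigned &OutHi) {
    if (Amt == 0) {
      OutLo = Lo;
      OutHi = Hi;
    } else if (Amt < 32) {
      OutLo = Lo << Amt;
      OutHi = (Hi << Amt) | (Lo >> (32 - Amt)); // SHLD-style combine
    } else {
      OutLo = 0;
      OutHi = Lo << (Amt - 32);
    }
  };
  (void)shl64FromParts;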
00594 
00595   if (Subtarget->hasSSE1())
00596     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
00597 
00598   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
00599 
00600   // Expand certain atomics
00601   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00602     MVT VT = IntVTs[i];
00603     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
00604     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
00605     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
00606   }
00607 
00608   if (Subtarget->hasCmpxchg16b()) {
00609     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
00610   }
00611 
00612   // FIXME - use subtarget debug flags
00613   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
00614       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
00615     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
00616   }
00617 
00618   if (Subtarget->is64Bit()) {
00619     setExceptionPointerRegister(X86::RAX);
00620     setExceptionSelectorRegister(X86::RDX);
00621   } else {
00622     setExceptionPointerRegister(X86::EAX);
00623     setExceptionSelectorRegister(X86::EDX);
00624   }
00625   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
00626   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
00627 
00628   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
00629   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
00630 
00631   setOperationAction(ISD::TRAP, MVT::Other, Legal);
00632   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
00633 
00634   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
00635   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
00636   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
00637   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
00638     // TargetInfo::X86_64ABIBuiltinVaList
00639     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
00640     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
00641   } else {
00642     // TargetInfo::CharPtrBuiltinVaList
00643     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
00644     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
00645   }
00646 
00647   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
00648   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
00649 
00650   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
00651 
00652   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
00653     // f32 and f64 use SSE.
00654     // Set up the FP register classes.
00655     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00656     addRegisterClass(MVT::f64, &X86::FR64RegClass);
00657 
00658     // Use ANDPD to simulate FABS.
00659     setOperationAction(ISD::FABS , MVT::f64, Custom);
00660     setOperationAction(ISD::FABS , MVT::f32, Custom);
00661 
00662     // Use XORP to simulate FNEG.
00663     setOperationAction(ISD::FNEG , MVT::f64, Custom);
00664     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00665 
00666     // Use ANDPD and ORPD to simulate FCOPYSIGN.
00667     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
00668     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00669 
00670     // Lower this to FGETSIGNx86 plus an AND.
00671     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
00672     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
00673 
00674     // We don't support sin/cos/fmod
00675     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00676     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00677     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00678     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00679     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00680     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00681 
00682     // Expand FP immediates into loads from the stack, except for the special
00683     // cases we handle.
00684     addLegalFPImmediate(APFloat(+0.0)); // xorpd
00685     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00686   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
00687     // Use SSE for f32, x87 for f64.
00688     // Set up the FP register classes.
00689     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00690     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00691 
00692     // Use ANDPS to simulate FABS.
00693     setOperationAction(ISD::FABS , MVT::f32, Custom);
00694 
00695     // Use XORP to simulate FNEG.
00696     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00697 
00698     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00699 
00700     // Use ANDPS and ORPS to simulate FCOPYSIGN.
00701     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00702     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00703 
00704     // We don't support sin/cos/fmod
00705     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00706     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00707     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00708 
00709     // Special cases we handle for FP constants.
00710     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00711     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00712     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00713     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00714     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00715 
00716     if (!TM.Options.UnsafeFPMath) {
00717       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00718       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00719       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00720     }
00721   } else if (!TM.Options.UseSoftFloat) {
00722     // f32 and f64 in x87.
00723     // Set up the FP register classes.
00724     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00725     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
00726 
00727     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00728     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
00729     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00730     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
00731 
00732     if (!TM.Options.UnsafeFPMath) {
00733       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00734       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00735       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00736       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00737       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00738       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00739     }
00740     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00741     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00742     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00743     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00744     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
00745     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
00746     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
00747     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
00748   }
00749 
00750   // We don't support FMA.
00751   setOperationAction(ISD::FMA, MVT::f64, Expand);
00752   setOperationAction(ISD::FMA, MVT::f32, Expand);
00753 
00754   // Long double always uses X87.
00755   if (!TM.Options.UseSoftFloat) {
00756     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
00757     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
00758     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
00759     {
00760       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
00761       addLegalFPImmediate(TmpFlt);  // FLD0
00762       TmpFlt.changeSign();
00763       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
00764 
00765       bool ignored;
00766       APFloat TmpFlt2(+1.0);
00767       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
00768                       &ignored);
00769       addLegalFPImmediate(TmpFlt2);  // FLD1
00770       TmpFlt2.changeSign();
00771       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
00772     }
00773 
00774     if (!TM.Options.UnsafeFPMath) {
00775       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
00776       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
00777       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
00778     }
00779 
00780     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
00781     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
00782     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
00783     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
00784     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
00785     setOperationAction(ISD::FMA, MVT::f80, Expand);
00786   }
00787 
00788   // Always use a library call for pow.
00789   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
00790   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
00791   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
00792 
00793   setOperationAction(ISD::FLOG, MVT::f80, Expand);
00794   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
00795   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
00796   setOperationAction(ISD::FEXP, MVT::f80, Expand);
00797   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
00798   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
00799   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
00800 
00801   // First set operation action for all vector types to either promote
00802   // (for widening) or expand (for scalarization). Then we will selectively
00803   // turn on ones that can be effectively codegen'd.
00804   for (int i = MVT::FIRST_VECTOR_VALUETYPE;
00805            i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
00806     MVT VT = (MVT::SimpleValueType)i;
00807     setOperationAction(ISD::ADD , VT, Expand);
00808     setOperationAction(ISD::SUB , VT, Expand);
00809     setOperationAction(ISD::FADD, VT, Expand);
00810     setOperationAction(ISD::FNEG, VT, Expand);
00811     setOperationAction(ISD::FSUB, VT, Expand);
00812     setOperationAction(ISD::MUL , VT, Expand);
00813     setOperationAction(ISD::FMUL, VT, Expand);
00814     setOperationAction(ISD::SDIV, VT, Expand);
00815     setOperationAction(ISD::UDIV, VT, Expand);
00816     setOperationAction(ISD::FDIV, VT, Expand);
00817     setOperationAction(ISD::SREM, VT, Expand);
00818     setOperationAction(ISD::UREM, VT, Expand);
00819     setOperationAction(ISD::LOAD, VT, Expand);
00820     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
00821     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
00822     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
00823     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
00824     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
00825     setOperationAction(ISD::FABS, VT, Expand);
00826     setOperationAction(ISD::FSIN, VT, Expand);
00827     setOperationAction(ISD::FSINCOS, VT, Expand);
00828     setOperationAction(ISD::FCOS, VT, Expand);
00829     setOperationAction(ISD::FSINCOS, VT, Expand);
00830     setOperationAction(ISD::FREM, VT, Expand);
00831     setOperationAction(ISD::FMA,  VT, Expand);
00832     setOperationAction(ISD::FPOWI, VT, Expand);
00833     setOperationAction(ISD::FSQRT, VT, Expand);
00834     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
00835     setOperationAction(ISD::FFLOOR, VT, Expand);
00836     setOperationAction(ISD::FCEIL, VT, Expand);
00837     setOperationAction(ISD::FTRUNC, VT, Expand);
00838     setOperationAction(ISD::FRINT, VT, Expand);
00839     setOperationAction(ISD::FNEARBYINT, VT, Expand);
00840     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
00841     setOperationAction(ISD::MULHS, VT, Expand);
00842     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
00843     setOperationAction(ISD::MULHU, VT, Expand);
00844     setOperationAction(ISD::SDIVREM, VT, Expand);
00845     setOperationAction(ISD::UDIVREM, VT, Expand);
00846     setOperationAction(ISD::FPOW, VT, Expand);
00847     setOperationAction(ISD::CTPOP, VT, Expand);
00848     setOperationAction(ISD::CTTZ, VT, Expand);
00849     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
00850     setOperationAction(ISD::CTLZ, VT, Expand);
00851     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
00852     setOperationAction(ISD::SHL, VT, Expand);
00853     setOperationAction(ISD::SRA, VT, Expand);
00854     setOperationAction(ISD::SRL, VT, Expand);
00855     setOperationAction(ISD::ROTL, VT, Expand);
00856     setOperationAction(ISD::ROTR, VT, Expand);
00857     setOperationAction(ISD::BSWAP, VT, Expand);
00858     setOperationAction(ISD::SETCC, VT, Expand);
00859     setOperationAction(ISD::FLOG, VT, Expand);
00860     setOperationAction(ISD::FLOG2, VT, Expand);
00861     setOperationAction(ISD::FLOG10, VT, Expand);
00862     setOperationAction(ISD::FEXP, VT, Expand);
00863     setOperationAction(ISD::FEXP2, VT, Expand);
00864     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
00865     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
00866     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
00867     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
00868     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
00869     setOperationAction(ISD::TRUNCATE, VT, Expand);
00870     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
00871     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
00872     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
00873     setOperationAction(ISD::VSELECT, VT, Expand);
00874     setOperationAction(ISD::SELECT_CC, VT, Expand);
00875     for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
00876              InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
00877       setTruncStoreAction(VT,
00878                           (MVT::SimpleValueType)InnerVT, Expand);
00879     setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
00880     setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
00881 
00882     // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types,
00883     // we have to deal with them whether we ask for Expansion or not. Setting
00884     // Expand causes its own optimisation problems though, so leave them legal.
00885     if (VT.getVectorElementType() == MVT::i1)
00886       setLoadExtAction(ISD::EXTLOAD, VT, Expand);
00887   }
00888 
00889   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
00890   // with -msoft-float, disable use of MMX as well.
00891   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
00892     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
00893     // No operations on x86mmx are supported; everything uses intrinsics.
00894   }
00895 
00896   // MMX-sized vectors (other than x86mmx) are expected to be expanded
00897   // into smaller operations.
00898   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
00899   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
00900   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
00901   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
00902   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
00903   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
00904   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
00905   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
00906   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
00907   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
00908   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
00909   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
00910   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
00911   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
00912   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
00913   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
00914   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
00915   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
00916   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
00917   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
00918   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
00919   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
00920   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
00921   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
00922   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
00923   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
00924   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
00925   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
00926   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
00927 
00928   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
00929     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
00930 
00931     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
00932     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
00933     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
00934     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
00935     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
00936     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
00937     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
00938     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
00939     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
00940     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
00941     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00942     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
00943     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
00944   }
00945 
00946   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
00947     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
00948 
00949     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
00950     // registers cannot be used even for integer operations.
00951     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
00952     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
00953     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
00954     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
00955 
00956     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
00957     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
00958     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
00959     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
00960     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
00961     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
00962     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
00963     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
00964     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
00965     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
00966     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
00967     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
00968     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
00969     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
00970     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
00971     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
00972     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
00973     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
00974     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
00975     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
00976     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
00977     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
00978 
00979     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
00980     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
00981     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
00982     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
00983 
00984     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
00985     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
00986     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
00987     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
00988     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
00989 
00990     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
00991     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
00992       MVT VT = (MVT::SimpleValueType)i;
00993       // Do not attempt to custom lower non-power-of-2 vectors
00994       if (!isPowerOf2_32(VT.getVectorNumElements()))
00995         continue;
00996       // Do not attempt to custom lower non-128-bit vectors
00997       if (!VT.is128BitVector())
00998         continue;
00999       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01000       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01001       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01002     }
01003 
01004     // We support custom legalizing of sext and anyext loads for specific
01005     // memory vector types which we can load as a scalar (or sequence of
01006     // scalars) and extend in-register to a legal 128-bit vector type. For sext
01007     // loads these must work with a single scalar load.
01008     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Custom);
01009     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Custom);
01010     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i8, Custom);
01011     setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Custom);
01012     setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Custom);
01013     setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, Custom);
01014     setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
01015     setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Custom);
01016     setLoadExtAction(ISD::EXTLOAD, MVT::v8i8, Custom);
01017 
01018     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
01019     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
01020     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
01021     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
01022     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
01023     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
01024 
01025     if (Subtarget->is64Bit()) {
01026       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01027       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01028     }
01029 
01030     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
01031     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
01032       MVT VT = (MVT::SimpleValueType)i;
01033 
01034       // Do not attempt to promote non-128-bit vectors
01035       if (!VT.is128BitVector())
01036         continue;
01037 
01038       setOperationAction(ISD::AND,    VT, Promote);
01039       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
01040       setOperationAction(ISD::OR,     VT, Promote);
01041       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
01042       setOperationAction(ISD::XOR,    VT, Promote);
01043       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
01044       setOperationAction(ISD::LOAD,   VT, Promote);
01045       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
01046       setOperationAction(ISD::SELECT, VT, Promote);
01047       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
01048     }
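    // "Promote to v2i64" for the bitwise ops above means the legalizer rewrites
    // e.g. (and v16i8 A, B) as a bitcast-wrapped v2i64 AND, so a single PAND
    // serves every 128-bit element type. A DAG-level sketch of that rewrite
    // (illustrative only, not used by the lowering):
    auto andViaV2i64 = [](SDValue A, SDValue B, SelectionDAG &DAG, SDLoc dl) {
      EVT OrigVT = A.getValueType();
      SDValue CastA = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, A);
      SDValue CastB = DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, B);
      SDValue And = DAG.getNode(ISD::AND, dl, MVT::v2i64, CastA, CastB);
      return DAG.getNode(ISD::BITCAST, dl, OrigVT, And);
    };
    (void)andViaV2i64;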
01049 
01050     // Custom lower v2i64 and v2f64 selects.
01051     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
01052     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
01053     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
01054     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
01055 
01056     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
01057     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
01058 
01059     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
01060     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
01061     // As there is no 64-bit GPR available, we need to build a special custom
01062     // sequence to convert from v2i32 to v2f32.
01063     if (!Subtarget->is64Bit())
01064       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
01065 
01066     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
01067     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
01068 
01069     setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
01070 
01071     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
01072     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
01073     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
01074   }
01075 
01076   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
01077     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
01078     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
01079     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
01080     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
01081     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
01082     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
01083     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
01084     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
01085     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
01086     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
01087 
01088     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
01089     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
01090     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
01091     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
01092     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
01093     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
01094     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
01095     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
01096     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
01097     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
01098 
01099     // FIXME: Do we need to handle scalar-to-vector here?
01100     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
01101 
01102     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
01103     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
01104     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
01105     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
01106     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
01107     // There is no BLENDI for byte vectors, so we don't custom lower this
01108     // vselect for now.
01109     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
01110 
01111     // SSE41 brings specific instructions for doing vector sign extend even in
01112     // cases where we don't have SRA.
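          // These presumably map onto the SSE4.1 sign-extending loads
          // (PMOVSXBQ/PMOVSXWQ/PMOVSXDQ), which widen 2 x i8/i16/i32 straight
          // to v2i64 without needing a 64-bit arithmetic shift.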
01113     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Custom);
01114     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Custom);
01115     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, Custom);
01116 
01117     // i8 and i16 vectors are custom because the source register and source
01118     // memory operand types are not the same width.  f32 vectors are
01119     // custom since the immediate controlling the insert encodes additional
01120     // information.
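          // (For reference: PINSRB/PINSRW take their scalar from a 32-bit
          // register but read an 8-bit/16-bit memory operand, and the INSERTPS
          // immediate also selects source/destination lanes and a zero mask,
          // which is the extra information mentioned above.)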
01121     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
01122     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
01123     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
01124     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
01125 
01126     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
01127     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
01128     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
01129     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
01130 
01131     // FIXME: these should be Legal, but that's only for the case where
01132     // the index is constant.  For now custom expand to deal with that.
01133     if (Subtarget->is64Bit()) {
01134       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01135       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01136     }
01137   }
01138 
01139   if (Subtarget->hasSSE2()) {
01140     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
01141     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
01142 
01143     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
01144     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
01145 
01146     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
01147     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
01148 
01149     // In the customized shift lowering, the cases that are legal with AVX2
01150     // will be recognized.
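          // Roughly: without AVX2 only uniform (splat) shift amounts map
          // directly onto PSLL/PSRL/PSRA; AVX2 adds per-element variable
          // shifts (VPSLLVD/Q, VPSRLVD/Q, VPSRAVD), which the custom lowering
          // can then pick out as legal.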
01151     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
01152     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
01153 
01154     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
01155     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
01156 
01157     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
01158   }
01159 
01160   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
01161     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
01162     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
01163     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
01164     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
01165     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
01166     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
01167 
01168     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
01169     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
01170     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
01171 
01172     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
01173     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
01174     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
01175     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
01176     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
01177     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
01178     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
01179     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
01180     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
01181     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
01182     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
01183     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
01184 
01185     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
01186     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
01187     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
01188     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
01189     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
01190     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
01191     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
01192     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
01193     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
01194     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
01195     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
01196     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
01197 
01198     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
01199     // even though v8i16 is a legal type.
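          // In effect, a v8f32 -> v8i16 fp_to_sint is done as the legal
          // v8f32 -> v8i32 conversion (VCVTTPS2DQ) followed by a truncate of
          // the result back to v8i16.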
01200     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
01201     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
01202     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
01203 
01204     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
01205     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
01206     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
01207 
01208     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
01209     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
01210 
01211     setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
01212 
01213     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
01214     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
01215 
01216     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
01217     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
01218 
01219     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
01220     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
01221 
01222     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
01223     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
01224     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
01225     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
01226 
01227     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
01228     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
01229     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
01230 
01231     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
01232     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
01233     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
01234     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
01235 
01236     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
01237     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
01238     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
01239     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
01240     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
01241     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
01242     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
01243     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
01244     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
01245     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
01246     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
01247     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
01248 
01249     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
01250       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
01251       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
01252       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
01253       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
01254       setOperationAction(ISD::FMA,             MVT::f32, Legal);
01255       setOperationAction(ISD::FMA,             MVT::f64, Legal);
01256     }
01257 
01258     if (Subtarget->hasInt256()) {
01259       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
01260       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
01261       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
01262       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
01263 
01264       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
01265       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
01266       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
01267       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
01268 
01269       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01270       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
01271       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
01272       // Don't lower v32i8 because there is no 128-bit byte mul
01273 
01274       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
01275       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
01276       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
01277       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
01278 
01279       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
01280       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
01281 
01282       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
01283       // when we have a 256-bit-wide blend with immediate.
01284       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
01285     } else {
01286       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
01287       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
01288       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
01289       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
01290 
01291       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
01292       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
01293       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
01294       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
01295 
01296       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01297       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
01298       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
01299       // Don't lower v32i8 because there is no 128-bit byte mul
01300     }
01301 
01302     // In the customized shift lowering, the cases that are legal with AVX2
01303     // will be recognized.
01304     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
01305     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
01306 
01307     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
01308     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
01309 
01310     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
01311 
01312     // Custom lower several nodes for 256-bit types.
01313     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01314              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01315       MVT VT = (MVT::SimpleValueType)i;
01316 
01317       if (VT.getScalarSizeInBits() >= 32) {
01318         setOperationAction(ISD::MLOAD,  VT, Legal);
01319         setOperationAction(ISD::MSTORE, VT, Legal);
01320       }
01321       // Extract subvector is special because the value type
01322       // (result) is 128-bit but the source is 256-bit wide.
01323       if (VT.is128BitVector()) {
01324         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01325       }
01326       // Do not attempt to custom lower other non-256-bit vectors
01327       if (!VT.is256BitVector())
01328         continue;
01329 
01330       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01331       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01332       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
01333       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01334       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
01335       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
01336       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
01337     }
01338 
01339     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
01340     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
01341       MVT VT = (MVT::SimpleValueType)i;
01342 
01343       // Do not attempt to promote non-256-bit vectors
01344       if (!VT.is256BitVector())
01345         continue;
01346 
01347       setOperationAction(ISD::AND,    VT, Promote);
01348       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
01349       setOperationAction(ISD::OR,     VT, Promote);
01350       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
01351       setOperationAction(ISD::XOR,    VT, Promote);
01352       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
01353       setOperationAction(ISD::LOAD,   VT, Promote);
01354       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
01355       setOperationAction(ISD::SELECT, VT, Promote);
01356       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
01357     }
01358   }
01359 
01360   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
01361     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
01362     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
01363     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
01364     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
01365 
01366     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
01367     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
01368     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
01369 
01370     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
01371     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
01372     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
01373     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
01374     setOperationAction(ISD::AND,                MVT::i1,    Legal);
01375     setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
01376     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
01377     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
01378     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
01379     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
01380     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
01381 
01382     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
01383     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
01384     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
01385     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
01386     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
01387     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
01388 
01389     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
01390     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
01391     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
01392     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
01393     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
01394     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
01395     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
01396     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
01397 
01398     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
01399     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
01400     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
01401     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
01402     if (Subtarget->is64Bit()) {
01403       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
01404       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
01405       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
01406       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
01407     }
01408     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
01409     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
01410     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
01411     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
01412     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
01413     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
01414     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
01415     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
01416     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
01417     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
01418     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
01419     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
01420     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
01421     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
01422 
01423     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
01424     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
01425     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
01426     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
01427     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
01428     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
01429     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
01430     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
01431     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
01432     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
01433     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
01434     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
01435     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
01436 
01437     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
01438     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
01439     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
01440     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
01441     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
01442     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
01443 
01444     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
01445     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
01446 
01447     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
01448 
01449     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
01450     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
01451     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
01452     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
01453     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
01454     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
01455     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
01456     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
01457     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
01458 
01459     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
01460     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
01461 
01462     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
01463     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
01464 
01465     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
01466 
01467     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
01468     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
01469 
01470     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
01471     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
01472 
01473     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
01474     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
01475 
01476     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
01477     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
01478     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
01479     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
01480     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
01481     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
01482 
01483     if (Subtarget->hasCDI()) {
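            // AVX-512 CD provides VPLZCNTD/VPLZCNTQ, which is why vector CTLZ
            // can simply be marked Legal here.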
01484       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
01485       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
01486     }
01487 
01488     // Custom lower several nodes.
01489     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01490              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01491       MVT VT = (MVT::SimpleValueType)i;
01492 
01493       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01494       // Extract subvector is special because the value type
01495       // (result) is 256/128-bit but the source is 512-bit wide.
01496       if (VT.is128BitVector() || VT.is256BitVector()) {
01497         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01498       }
01499       if (VT.getVectorElementType() == MVT::i1)
01500         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
01501 
01502       // Do not attempt to custom lower other non-512-bit vectors
01503       if (!VT.is512BitVector())
01504         continue;
01505 
01506       if (EltSize >= 32) {
01507         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
01508         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
01509         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01510         setOperationAction(ISD::VSELECT,             VT, Legal);
01511         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
01512         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
01513         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
01514         setOperationAction(ISD::MLOAD,               VT, Legal);
01515         setOperationAction(ISD::MSTORE,              VT, Legal);
01516       }
01517     }
01518     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01519       MVT VT = (MVT::SimpleValueType)i;
01520 
01521       // Do not attempt to promote non-512-bit vectors.
01522       if (!VT.is512BitVector())
01523         continue;
01524 
01525       setOperationAction(ISD::SELECT, VT, Promote);
01526       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
01527     }
01528   } // has AVX-512
01529 
01530   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
01531     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
01532     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
01533 
01534     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
01535     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
01536 
01537     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
01538     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
01539     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
01540     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
01541     setOperationAction(ISD::ADD,                MVT::v32i16, Legal);
01542     setOperationAction(ISD::ADD,                MVT::v64i8, Legal);
01543     setOperationAction(ISD::SUB,                MVT::v32i16, Legal);
01544     setOperationAction(ISD::SUB,                MVT::v64i8, Legal);
01545     setOperationAction(ISD::MUL,                MVT::v32i16, Legal);
01546 
01547     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01548       const MVT VT = (MVT::SimpleValueType)i;
01549 
01550       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01551 
01552       // Do not attempt to custom lower non-512-bit vectors.
01553       if (!VT.is512BitVector())
01554         continue;
01555 
01556       if (EltSize < 32) {
01557         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01558         setOperationAction(ISD::VSELECT,             VT, Legal);
01559       }
01560     }
01561   }
01562 
01563   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
01564     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
01565     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
01566 
01567     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
01568     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
01569     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
01570 
01571     setOperationAction(ISD::AND,                MVT::v8i32, Legal);
01572     setOperationAction(ISD::OR,                 MVT::v8i32, Legal);
01573     setOperationAction(ISD::XOR,                MVT::v8i32, Legal);
01574     setOperationAction(ISD::AND,                MVT::v4i32, Legal);
01575     setOperationAction(ISD::OR,                 MVT::v4i32, Legal);
01576     setOperationAction(ISD::XOR,                MVT::v4i32, Legal);
01577   }
01578 
01579   // SIGN_EXTEND_INREGs are evaluated by the extend type, so handle the
01580   // expansion with custom code for every vector type.
01581   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
01582            VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
01583     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
01584                        Custom);
01585   }
01586 
01587   // We want to custom lower some of our intrinsics.
01588   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
01589   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
01590   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
01591   if (!Subtarget->is64Bit())
01592     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
01593 
01594   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
01595   // handle type legalization for these operations here.
01596   //
01597   // FIXME: We really should do custom legalization for addition and
01598   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
01599   // than generic legalization for 64-bit multiplication-with-overflow, though.
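        // A sketch of the lowering (not a normative description): these
        // overflow ops become X86 arithmetic nodes that also produce EFLAGS,
        // e.g. (saddo a, b) -> (X86ISD::ADD a, b), with the overflow bit read
        // back through an X86ISD::SETCC of the overflow/carry condition.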
01600   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
01601     // Add/Sub/Mul with overflow operations are custom lowered.
01602     MVT VT = IntVTs[i];
01603     setOperationAction(ISD::SADDO, VT, Custom);
01604     setOperationAction(ISD::UADDO, VT, Custom);
01605     setOperationAction(ISD::SSUBO, VT, Custom);
01606     setOperationAction(ISD::USUBO, VT, Custom);
01607     setOperationAction(ISD::SMULO, VT, Custom);
01608     setOperationAction(ISD::UMULO, VT, Custom);
01609   }
01610 
01611 
01612   if (!Subtarget->is64Bit()) {
01613     // These libcalls are not available in 32-bit.
01614     setLibcallName(RTLIB::SHL_I128, nullptr);
01615     setLibcallName(RTLIB::SRL_I128, nullptr);
01616     setLibcallName(RTLIB::SRA_I128, nullptr);
01617   }
01618 
01619   // Combine sin / cos into one node or libcall if possible.
01620   if (Subtarget->hasSinCos()) {
01621     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
01622     setLibcallName(RTLIB::SINCOS_F64, "sincos");
01623     if (Subtarget->isTargetDarwin()) {
01624       // For MacOSX, we don't want the normal expansion of a libcall to sincos.
01625       // We want to issue a libcall to __sincos_stret to avoid memory traffic.
01626       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
01627       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
01628     }
01629   }
01630 
01631   if (Subtarget->isTargetWin64()) {
01632     setOperationAction(ISD::SDIV, MVT::i128, Custom);
01633     setOperationAction(ISD::UDIV, MVT::i128, Custom);
01634     setOperationAction(ISD::SREM, MVT::i128, Custom);
01635     setOperationAction(ISD::UREM, MVT::i128, Custom);
01636     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
01637     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
01638   }
01639 
01640   // We have target-specific dag combine patterns for the following nodes:
01641   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
01642   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
01643   setTargetDAGCombine(ISD::VSELECT);
01644   setTargetDAGCombine(ISD::SELECT);
01645   setTargetDAGCombine(ISD::SHL);
01646   setTargetDAGCombine(ISD::SRA);
01647   setTargetDAGCombine(ISD::SRL);
01648   setTargetDAGCombine(ISD::OR);
01649   setTargetDAGCombine(ISD::AND);
01650   setTargetDAGCombine(ISD::ADD);
01651   setTargetDAGCombine(ISD::FADD);
01652   setTargetDAGCombine(ISD::FSUB);
01653   setTargetDAGCombine(ISD::FMA);
01654   setTargetDAGCombine(ISD::SUB);
01655   setTargetDAGCombine(ISD::LOAD);
01656   setTargetDAGCombine(ISD::STORE);
01657   setTargetDAGCombine(ISD::ZERO_EXTEND);
01658   setTargetDAGCombine(ISD::ANY_EXTEND);
01659   setTargetDAGCombine(ISD::SIGN_EXTEND);
01660   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
01661   setTargetDAGCombine(ISD::TRUNCATE);
01662   setTargetDAGCombine(ISD::SINT_TO_FP);
01663   setTargetDAGCombine(ISD::SETCC);
01664   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
01665   setTargetDAGCombine(ISD::BUILD_VECTOR);
01666   if (Subtarget->is64Bit())
01667     setTargetDAGCombine(ISD::MUL);
01668   setTargetDAGCombine(ISD::XOR);
01669 
01670   computeRegisterProperties();
01671 
01672   // On Darwin, -Os means optimize for size without hurting performance, so
01673   // do not reduce the limit.
01674   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
01675   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
01676   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
01677   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01678   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
01679   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01680   setPrefLoopAlignment(4); // 2^4 bytes.
01681 
01682   // Predictable cmovs don't hurt on Atom because it's in-order.
01683   PredictableSelectIsExpensive = !Subtarget->isAtom();
01684   EnableExtLdPromotion = true;
01685   setPrefFunctionAlignment(4); // 2^4 bytes.
01686 
01687   verifyIntrinsicTables();
01688 }
01689 
01690 // This has so far only been implemented for 64-bit MachO.
01691 bool X86TargetLowering::useLoadStackGuardNode() const {
01692   return Subtarget->isTargetMachO() && Subtarget->is64Bit();
01693 }
01694 
01695 TargetLoweringBase::LegalizeTypeAction
01696 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
01697   if (ExperimentalVectorWideningLegalization &&
01698       VT.getVectorNumElements() != 1 &&
01699       VT.getVectorElementType().getSimpleVT() != MVT::i1)
01700     return TypeWidenVector;
01701 
01702   return TargetLoweringBase::getPreferredVectorAction(VT);
01703 }
01704 
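      /// Return the value type to use for ISD::SETCC. Summarizing the logic
      /// below: scalars get i1 with AVX-512 and i8 otherwise, and vectors get
      /// the matching mask types (v8i1/v16i1, v32i1/v64i1 with BWI, and
      /// v2i1/v4i1/v8i1 with VLX) when the element type allows it; otherwise
      /// the vector keeps its shape with the element type made integer.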
01705 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
01706   if (!VT.isVector())
01707     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
01708 
01709   const unsigned NumElts = VT.getVectorNumElements();
01710   const EVT EltVT = VT.getVectorElementType();
01711   if (VT.is512BitVector()) {
01712     if (Subtarget->hasAVX512())
01713       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01714           EltVT == MVT::f32 || EltVT == MVT::f64)
01715         switch(NumElts) {
01716         case  8: return MVT::v8i1;
01717         case 16: return MVT::v16i1;
01718       }
01719     if (Subtarget->hasBWI())
01720       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01721         switch(NumElts) {
01722         case 32: return MVT::v32i1;
01723         case 64: return MVT::v64i1;
01724       }
01725   }
01726 
01727   if (VT.is256BitVector() || VT.is128BitVector()) {
01728     if (Subtarget->hasVLX())
01729       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01730           EltVT == MVT::f32 || EltVT == MVT::f64)
01731         switch(NumElts) {
01732         case 2: return MVT::v2i1;
01733         case 4: return MVT::v4i1;
01734         case 8: return MVT::v8i1;
01735       }
01736     if (Subtarget->hasBWI() && Subtarget->hasVLX())
01737       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01738         switch(NumElts) {
01739         case  8: return MVT::v8i1;
01740         case 16: return MVT::v16i1;
01741         case 32: return MVT::v32i1;
01742       }
01743   }
01744 
01745   return VT.changeVectorElementTypeToInteger();
01746 }
01747 
01748 /// Helper for getByValTypeAlignment to determine
01749 /// the desired ByVal argument alignment.
01750 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
01751   if (MaxAlign == 16)
01752     return;
01753   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
01754     if (VTy->getBitWidth() == 128)
01755       MaxAlign = 16;
01756   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
01757     unsigned EltAlign = 0;
01758     getMaxByValAlign(ATy->getElementType(), EltAlign);
01759     if (EltAlign > MaxAlign)
01760       MaxAlign = EltAlign;
01761   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
01762     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
01763       unsigned EltAlign = 0;
01764       getMaxByValAlign(STy->getElementType(i), EltAlign);
01765       if (EltAlign > MaxAlign)
01766         MaxAlign = EltAlign;
01767       if (MaxAlign == 16)
01768         break;
01769     }
01770   }
01771 }
01772 
01773 /// Return the desired alignment for ByVal aggregate
01774 /// function arguments in the caller parameter area. For X86, aggregates
01775 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
01776 /// are at 4-byte boundaries.
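      /// For example (illustrative only): on 32-bit x86 with SSE, a struct
      /// containing a <4 x float> member is aligned to 16 bytes, a struct of
      /// plain i32 members stays at 4, and on x86-64 the result is simply
      /// max(8, ABI type alignment).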
01777 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
01778   if (Subtarget->is64Bit()) {
01779     // Max of 8 and alignment of type.
01780     unsigned TyAlign = TD->getABITypeAlignment(Ty);
01781     if (TyAlign > 8)
01782       return TyAlign;
01783     return 8;
01784   }
01785 
01786   unsigned Align = 4;
01787   if (Subtarget->hasSSE1())
01788     getMaxByValAlign(Ty, Align);
01789   return Align;
01790 }
01791 
01792 /// Returns the target specific optimal type for load
01793 /// and store operations as a result of memset, memcpy, and memmove
01794 /// lowering. If DstAlign is zero, the destination can satisfy any alignment
01795 /// constraint. Similarly, if SrcAlign is zero, there is no need to check it
01796 /// against an alignment requirement,
01797 /// probably because the source does not need to be loaded. If 'IsMemset' is
01798 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
01799 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
01800 /// source is constant so it does not need to be loaded.
01801 /// It returns EVT::Other if the type should be determined using generic
01802 /// target-independent logic.
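      /// For example (following the logic below, assuming the function is not
      /// marked NoImplicitFloat): a 32-byte, sufficiently aligned memcpy uses
      /// v8i32 with AVX2 (v8f32 with AVX), v4i32 with SSE2, and otherwise
      /// falls back to i64/i32-sized chunks.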
01803 EVT
01804 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
01805                                        unsigned DstAlign, unsigned SrcAlign,
01806                                        bool IsMemset, bool ZeroMemset,
01807                                        bool MemcpyStrSrc,
01808                                        MachineFunction &MF) const {
01809   const Function *F = MF.getFunction();
01810   if ((!IsMemset || ZeroMemset) &&
01811       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
01812                                        Attribute::NoImplicitFloat)) {
01813     if (Size >= 16 &&
01814         (Subtarget->isUnalignedMemAccessFast() ||
01815          ((DstAlign == 0 || DstAlign >= 16) &&
01816           (SrcAlign == 0 || SrcAlign >= 16)))) {
01817       if (Size >= 32) {
01818         if (Subtarget->hasInt256())
01819           return MVT::v8i32;
01820         if (Subtarget->hasFp256())
01821           return MVT::v8f32;
01822       }
01823       if (Subtarget->hasSSE2())
01824         return MVT::v4i32;
01825       if (Subtarget->hasSSE1())
01826         return MVT::v4f32;
01827     } else if (!MemcpyStrSrc && Size >= 8 &&
01828                !Subtarget->is64Bit() &&
01829                Subtarget->hasSSE2()) {
01830       // Do not use f64 to lower memcpy if source is string constant. It's
01831       // better to use i32 to avoid the loads.
01832       return MVT::f64;
01833     }
01834   }
01835   if (Subtarget->is64Bit() && Size >= 8)
01836     return MVT::i64;
01837   return MVT::i32;
01838 }
01839 
01840 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
01841   if (VT == MVT::f32)
01842     return X86ScalarSSEf32;
01843   else if (VT == MVT::f64)
01844     return X86ScalarSSEf64;
01845   return true;
01846 }
01847 
01848 bool
01849 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
01850                                                   unsigned,
01851                                                   unsigned,
01852                                                   bool *Fast) const {
01853   if (Fast)
01854     *Fast = Subtarget->isUnalignedMemAccessFast();
01855   return true;
01856 }
01857 
01858 /// Return the entry encoding for a jump table in the
01859 /// current function.  The returned value is a member of the
01860 /// MachineJumpTableInfo::JTEntryKind enum.
01861 unsigned X86TargetLowering::getJumpTableEncoding() const {
01862   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
01863   // symbol.
01864   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01865       Subtarget->isPICStyleGOT())
01866     return MachineJumpTableInfo::EK_Custom32;
01867 
01868   // Otherwise, use the normal jump table encoding heuristics.
01869   return TargetLowering::getJumpTableEncoding();
01870 }
01871 
01872 const MCExpr *
01873 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
01874                                              const MachineBasicBlock *MBB,
01875                                              unsigned uid, MCContext &Ctx) const {
01876   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
01877          Subtarget->isPICStyleGOT());
01878   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
01879   // entries.
01880   return MCSymbolRefExpr::Create(MBB->getSymbol(),
01881                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
01882 }
01883 
01884 /// Returns relocation base for the given PIC jumptable.
01885 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
01886                                                     SelectionDAG &DAG) const {
01887   if (!Subtarget->is64Bit())
01888     // This doesn't have SDLoc associated with it, but is not really the
01889     // same as a Register.
01890     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
01891   return Table;
01892 }
01893 
01894 /// This returns the relocation base for the given PIC jumptable,
01895 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
01896 const MCExpr *X86TargetLowering::
01897 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
01898                              MCContext &Ctx) const {
01899   // X86-64 uses RIP relative addressing based on the jump table label.
01900   if (Subtarget->isPICStyleRIPRel())
01901     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
01902 
01903   // Otherwise, the reference is relative to the PIC base.
01904   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
01905 }
01906 
01907 // FIXME: Why is this routine here? Move to RegInfo!
01908 std::pair<const TargetRegisterClass*, uint8_t>
01909 X86TargetLowering::findRepresentativeClass(MVT VT) const{
01910   const TargetRegisterClass *RRC = nullptr;
01911   uint8_t Cost = 1;
01912   switch (VT.SimpleTy) {
01913   default:
01914     return TargetLowering::findRepresentativeClass(VT);
01915   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
01916     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
01917     break;
01918   case MVT::x86mmx:
01919     RRC = &X86::VR64RegClass;
01920     break;
01921   case MVT::f32: case MVT::f64:
01922   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
01923   case MVT::v4f32: case MVT::v2f64:
01924   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
01925   case MVT::v4f64:
01926     RRC = &X86::VR128RegClass;
01927     break;
01928   }
01929   return std::make_pair(RRC, Cost);
01930 }
01931 
01932 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
01933                                                unsigned &Offset) const {
01934   if (!Subtarget->isTargetLinux())
01935     return false;
01936 
01937   if (Subtarget->is64Bit()) {
01938     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:0x28.
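          // (In the X86 backend, address space 256 refers to %gs and 257 to
          // %fs, so the cookie load is emitted against that segment.)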
01939     Offset = 0x28;
01940     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
01941       AddressSpace = 256;
01942     else
01943       AddressSpace = 257;
01944   } else {
01945     // %gs:0x14 on i386
01946     Offset = 0x14;
01947     AddressSpace = 256;
01948   }
01949   return true;
01950 }
01951 
01952 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
01953                                             unsigned DestAS) const {
01954   assert(SrcAS != DestAS && "Expected different address spaces!");
01955 
01956   return SrcAS < 256 && DestAS < 256;
01957 }
01958 
01959 //===----------------------------------------------------------------------===//
01960 //               Return Value Calling Convention Implementation
01961 //===----------------------------------------------------------------------===//
01962 
01963 #include "X86GenCallingConv.inc"
01964 
01965 bool
01966 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
01967                                   MachineFunction &MF, bool isVarArg,
01968                         const SmallVectorImpl<ISD::OutputArg> &Outs,
01969                         LLVMContext &Context) const {
01970   SmallVector<CCValAssign, 16> RVLocs;
01971   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
01972   return CCInfo.CheckReturn(Outs, RetCC_X86);
01973 }
01974 
01975 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
01976   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
01977   return ScratchRegs;
01978 }
01979 
01980 SDValue
01981 X86TargetLowering::LowerReturn(SDValue Chain,
01982                                CallingConv::ID CallConv, bool isVarArg,
01983                                const SmallVectorImpl<ISD::OutputArg> &Outs,
01984                                const SmallVectorImpl<SDValue> &OutVals,
01985                                SDLoc dl, SelectionDAG &DAG) const {
01986   MachineFunction &MF = DAG.getMachineFunction();
01987   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
01988 
01989   SmallVector<CCValAssign, 16> RVLocs;
01990   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
01991   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
01992 
01993   SDValue Flag;
01994   SmallVector<SDValue, 6> RetOps;
01995   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
01996   // Operand #1 = Bytes To Pop
01997   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
01998                    MVT::i16));
01999 
02000   // Copy the result values into the output registers.
02001   for (unsigned i = 0; i != RVLocs.size(); ++i) {
02002     CCValAssign &VA = RVLocs[i];
02003     assert(VA.isRegLoc() && "Can only return in registers!");
02004     SDValue ValToCopy = OutVals[i];
02005     EVT ValVT = ValToCopy.getValueType();
02006 
02007     // Promote values to the appropriate types.
02008     if (VA.getLocInfo() == CCValAssign::SExt)
02009       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
02010     else if (VA.getLocInfo() == CCValAssign::ZExt)
02011       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
02012     else if (VA.getLocInfo() == CCValAssign::AExt)
02013       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
02014     else if (VA.getLocInfo() == CCValAssign::BCvt)
02015       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
02016 
02017     assert(VA.getLocInfo() != CCValAssign::FPExt &&
02018            "Unexpected FP-extend for return value.");
02019 
02020     // If this is x86-64, and we disabled SSE, we can't return FP values,
02021     // or SSE or MMX vectors.
02022     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
02023          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
02024           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
02025       report_fatal_error("SSE register return with SSE disabled");
02026     }
02027     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
02028     // llvm-gcc has never done it right and no one has noticed, so this
02029     // should be OK for now.
02030     if (ValVT == MVT::f64 &&
02031         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
02032       report_fatal_error("SSE2 register return with SSE2 disabled");
02033 
02034     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
02035     // the RET instruction and handled by the FP Stackifier.
02036     if (VA.getLocReg() == X86::FP0 ||
02037         VA.getLocReg() == X86::FP1) {
02038       // If this is a copy from an xmm register to ST(0), use an FPExtend to
02039       // change the value to the FP stack register class.
02040       if (isScalarFPTypeInSSEReg(VA.getValVT()))
02041         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
02042       RetOps.push_back(ValToCopy);
02043       // Don't emit a copytoreg.
02044       continue;
02045     }
02046 
02047     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
02048     // which is returned in RAX / RDX.
02049     if (Subtarget->is64Bit()) {
02050       if (ValVT == MVT::x86mmx) {
02051         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
02052           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
02053           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
02054                                   ValToCopy);
02055           // If we don't have SSE2 available, convert to v4f32 so the generated
02056           // register is legal.
02057           if (!Subtarget->hasSSE2())
02058             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, ValToCopy);
02059         }
02060       }
02061     }
02062 
02063     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
02064     Flag = Chain.getValue(1);
02065     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
02066   }
02067 
02068   // The x86-64 ABIs require that for returning structs by value we copy
02069   // the sret argument into %rax/%eax (depending on ABI) for the return.
02070   // Win32 requires us to put the sret argument to %eax as well.
02071   // We saved the argument into a virtual register in the entry block,
02072   // so now we copy the value out and into %rax/%eax.
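        // For illustration (hypothetical IR): for a function such as
        //   define void @f(%struct.S* sret %out)
        // the pointer %out is returned again in %rax (or %eax), and some
        // callers rely on that.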
02073   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
02074       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
02075     MachineFunction &MF = DAG.getMachineFunction();
02076     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02077     unsigned Reg = FuncInfo->getSRetReturnReg();
02078     assert(Reg &&
02079            "SRetReturnReg should have been set in LowerFormalArguments().");
02080     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
02081 
02082     unsigned RetValReg
02083         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
02084           X86::RAX : X86::EAX;
02085     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
02086     Flag = Chain.getValue(1);
02087 
02088     // RAX/EAX now acts like a return value.
02089     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
02090   }
02091 
02092   RetOps[0] = Chain;  // Update chain.
02093 
02094   // Add the flag if we have it.
02095   if (Flag.getNode())
02096     RetOps.push_back(Flag);
02097 
02098   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
02099 }
02100 
02101 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
02102   if (N->getNumValues() != 1)
02103     return false;
02104   if (!N->hasNUsesOfValue(1, 0))
02105     return false;
02106 
02107   SDValue TCChain = Chain;
02108   SDNode *Copy = *N->use_begin();
02109   if (Copy->getOpcode() == ISD::CopyToReg) {
02110     // If the copy has a glue operand, we conservatively assume it isn't safe to
02111     // perform a tail call.
02112     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
02113       return false;
02114     TCChain = Copy->getOperand(0);
02115   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
02116     return false;
02117 
02118   bool HasRet = false;
02119   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
02120        UI != UE; ++UI) {
02121     if (UI->getOpcode() != X86ISD::RET_FLAG)
02122       return false;
02123     // If we are returning more than one value, we can definitely
02124     // not make a tail call; see PR19530.
02125     if (UI->getNumOperands() > 4)
02126       return false;
02127     if (UI->getNumOperands() == 4 &&
02128         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
02129       return false;
02130     HasRet = true;
02131   }
02132 
02133   if (!HasRet)
02134     return false;
02135 
02136   Chain = TCChain;
02137   return true;
02138 }
02139 
02140 EVT
02141 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
02142                                             ISD::NodeType ExtendKind) const {
02143   MVT ReturnMVT;
02144   // TODO: Is this also valid on 32-bit?
02145   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
02146     ReturnMVT = MVT::i8;
02147   else
02148     ReturnMVT = MVT::i32;
02149 
02150   EVT MinVT = getRegisterType(Context, ReturnMVT);
02151   return VT.bitsLT(MinVT) ? MinVT : VT;
02152 }
02153 
02154 /// Lower the result values of a call into the
02155 /// appropriate copies out of appropriate physical registers.
02156 ///
02157 SDValue
02158 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
02159                                    CallingConv::ID CallConv, bool isVarArg,
02160                                    const SmallVectorImpl<ISD::InputArg> &Ins,
02161                                    SDLoc dl, SelectionDAG &DAG,
02162                                    SmallVectorImpl<SDValue> &InVals) const {
02163 
02164   // Assign locations to each value returned by this call.
02165   SmallVector<CCValAssign, 16> RVLocs;
02166   bool Is64Bit = Subtarget->is64Bit();
02167   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
02168                  *DAG.getContext());
02169   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
02170 
02171   // Copy all of the result registers out of their specified physreg.
02172   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
02173     CCValAssign &VA = RVLocs[i];
02174     EVT CopyVT = VA.getValVT();
02175 
02176     // If this is x86-64, and we disabled SSE, we can't return FP values
02177     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
02178         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
02179       report_fatal_error("SSE register return with SSE disabled");
02180     }
02181 
02182     // If we prefer to use the value in xmm registers, copy it out as f80 and
02183     // use a truncate to move it from fp stack reg to xmm reg.
02184     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
02185         isScalarFPTypeInSSEReg(VA.getValVT()))
02186       CopyVT = MVT::f80;
02187 
02188     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
02189                                CopyVT, InFlag).getValue(1);
02190     SDValue Val = Chain.getValue(0);
02191 
02192     if (CopyVT != VA.getValVT())
02193       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
02194                         // This truncation won't change the value.
02195                         DAG.getIntPtrConstant(1));
02196 
02197     InFlag = Chain.getValue(2);
02198     InVals.push_back(Val);
02199   }
02200 
02201   return Chain;
02202 }
02203 
02204 //===----------------------------------------------------------------------===//
02205 //                C & StdCall & Fast Calling Convention implementation
02206 //===----------------------------------------------------------------------===//
02207 //  The StdCall calling convention is standard for many Windows API routines.
02208 //  It differs from the C calling convention only a little: the callee cleans
02209 //  up the stack instead of the caller, and symbols are decorated in some
02210 //  fancy way :) It doesn't support any vector arguments.
02211 //  For info on the fast calling convention, see the Fast Calling Convention
02212 //  (tail call) implementation in LowerX86_32FastCCCallTo.
02213 
02214 /// CallIsStructReturn - Determines whether a call uses struct return
02215 /// semantics.
02216 enum StructReturnType {
02217   NotStructReturn,
02218   RegStructReturn,
02219   StackStructReturn
02220 };
02221 static StructReturnType
02222 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
02223   if (Outs.empty())
02224     return NotStructReturn;
02225 
02226   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
02227   if (!Flags.isSRet())
02228     return NotStructReturn;
02229   if (Flags.isInReg())
02230     return RegStructReturn;
02231   return StackStructReturn;
02232 }
02233 
02234 /// Determines whether a function uses struct return semantics.
02235 static StructReturnType
02236 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
02237   if (Ins.empty())
02238     return NotStructReturn;
02239 
02240   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
02241   if (!Flags.isSRet())
02242     return NotStructReturn;
02243   if (Flags.isInReg())
02244     return RegStructReturn;
02245   return StackStructReturn;
02246 }
02247 
02248 /// Make a copy of an aggregate at address specified by "Src" to address
02249 /// "Dst" with size and alignment information specified by the specific
02250 /// parameter attribute. The copy will be passed as a byval function parameter.
02251 static SDValue
02252 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
02253                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
02254                           SDLoc dl) {
02255   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
02256 
02257   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
02258                        /*isVolatile*/false, /*AlwaysInline=*/true,
02259                        MachinePointerInfo(), MachinePointerInfo());
02260 }
02261 
02262 /// Return true if the calling convention is one that
02263 /// supports tail call optimization.
02264 static bool IsTailCallConvention(CallingConv::ID CC) {
02265   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
02266           CC == CallingConv::HiPE);
02267 }
02268 
02269 /// \brief Return true if the calling convention is a C calling convention.
02270 static bool IsCCallConvention(CallingConv::ID CC) {
02271   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
02272           CC == CallingConv::X86_64_SysV);
02273 }
02274 
02275 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
02276   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
02277     return false;
02278 
02279   CallSite CS(CI);
02280   CallingConv::ID CalleeCC = CS.getCallingConv();
02281   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
02282     return false;
02283 
02284   return true;
02285 }
02286 
02287 /// Return true if the function is being made into
02288 /// a tailcall target by changing its ABI.
02289 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
02290                                    bool GuaranteedTailCallOpt) {
02291   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
02292 }
02293 
02294 SDValue
02295 X86TargetLowering::LowerMemArgument(SDValue Chain,
02296                                     CallingConv::ID CallConv,
02297                                     const SmallVectorImpl<ISD::InputArg> &Ins,
02298                                     SDLoc dl, SelectionDAG &DAG,
02299                                     const CCValAssign &VA,
02300                                     MachineFrameInfo *MFI,
02301                                     unsigned i) const {
02302   // Create the nodes corresponding to a load from this parameter slot.
02303   ISD::ArgFlagsTy Flags = Ins[i].Flags;
02304   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
02305       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
02306   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
02307   EVT ValVT;
02308 
02309   // If the value is passed by pointer, we have the address passed instead of
02310   // the value itself.
02311   if (VA.getLocInfo() == CCValAssign::Indirect)
02312     ValVT = VA.getLocVT();
02313   else
02314     ValVT = VA.getValVT();
02315 
02316   // FIXME: For now, all byval parameter objects are marked mutable. This can be
02317   // changed with more analysis.
02318   // In case of tail call optimization, mark all arguments mutable, since they
02319   // could be overwritten by the lowering of arguments for a tail call.
02320   if (Flags.isByVal()) {
02321     unsigned Bytes = Flags.getByValSize();
02322     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
02323     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
02324     return DAG.getFrameIndex(FI, getPointerTy());
02325   } else {
02326     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
02327                                     VA.getLocMemOffset(), isImmutable);
02328     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
02329     return DAG.getLoad(ValVT, dl, Chain, FIN,
02330                        MachinePointerInfo::getFixedStack(FI),
02331                        false, false, false, 0);
02332   }
02333 }
02334 
02335 // FIXME: Get this from tablegen.
02336 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
02337                                                 const X86Subtarget *Subtarget) {
02338   assert(Subtarget->is64Bit());
02339 
02340   if (Subtarget->isCallingConvWin64(CallConv)) {
02341     static const MCPhysReg GPR64ArgRegsWin64[] = {
02342       X86::RCX, X86::RDX, X86::R8,  X86::R9
02343     };
02344     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
02345   }
02346 
02347   static const MCPhysReg GPR64ArgRegs64Bit[] = {
02348     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
02349   };
02350   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
02351 }
02352 
02353 // FIXME: Get this from tablegen.
02354 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
02355                                                 CallingConv::ID CallConv,
02356                                                 const X86Subtarget *Subtarget) {
02357   assert(Subtarget->is64Bit());
02358   if (Subtarget->isCallingConvWin64(CallConv)) {
02359     // The XMM registers which might contain var arg parameters are shadowed
02360     // in their paired GPRs, so we only need to save the GPRs to their home
02361     // slots.
02362     // TODO: __vectorcall will change this.
02363     return None;
02364   }
02365 
02366   const Function *Fn = MF.getFunction();
02367   bool NoImplicitFloatOps = Fn->getAttributes().
02368       hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
02369   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
02370          "SSE register cannot be used when SSE is disabled!");
02371   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
02372       !Subtarget->hasSSE1())
02373     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
02374     // registers.
02375     return None;
02376 
02377   static const MCPhysReg XMMArgRegs64Bit[] = {
02378     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02379     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02380   };
02381   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
02382 }
02383 
02384 SDValue
02385 X86TargetLowering::LowerFormalArguments(SDValue Chain,
02386                                         CallingConv::ID CallConv,
02387                                         bool isVarArg,
02388                                       const SmallVectorImpl<ISD::InputArg> &Ins,
02389                                         SDLoc dl,
02390                                         SelectionDAG &DAG,
02391                                         SmallVectorImpl<SDValue> &InVals)
02392                                           const {
02393   MachineFunction &MF = DAG.getMachineFunction();
02394   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02395 
02396   const Function* Fn = MF.getFunction();
02397   if (Fn->hasExternalLinkage() &&
02398       Subtarget->isTargetCygMing() &&
02399       Fn->getName() == "main")
02400     FuncInfo->setForceFramePointer(true);
02401 
02402   MachineFrameInfo *MFI = MF.getFrameInfo();
02403   bool Is64Bit = Subtarget->is64Bit();
02404   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
02405 
02406   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02407          "Var args not supported with calling convention fastcc, ghc or hipe");
02408 
02409   // Assign locations to all of the incoming arguments.
02410   SmallVector<CCValAssign, 16> ArgLocs;
02411   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02412 
02413   // Allocate shadow area for Win64
02414   if (IsWin64)
02415     CCInfo.AllocateStack(32, 8);
02416 
02417   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
02418 
02419   unsigned LastVal = ~0U;
02420   SDValue ArgValue;
02421   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02422     CCValAssign &VA = ArgLocs[i];
02423     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
02424     // places.
02425     assert(VA.getValNo() != LastVal &&
02426            "Don't support value assigned to multiple locs yet");
02427     (void)LastVal;
02428     LastVal = VA.getValNo();
02429 
02430     if (VA.isRegLoc()) {
02431       EVT RegVT = VA.getLocVT();
02432       const TargetRegisterClass *RC;
02433       if (RegVT == MVT::i32)
02434         RC = &X86::GR32RegClass;
02435       else if (Is64Bit && RegVT == MVT::i64)
02436         RC = &X86::GR64RegClass;
02437       else if (RegVT == MVT::f32)
02438         RC = &X86::FR32RegClass;
02439       else if (RegVT == MVT::f64)
02440         RC = &X86::FR64RegClass;
02441       else if (RegVT.is512BitVector())
02442         RC = &X86::VR512RegClass;
02443       else if (RegVT.is256BitVector())
02444         RC = &X86::VR256RegClass;
02445       else if (RegVT.is128BitVector())
02446         RC = &X86::VR128RegClass;
02447       else if (RegVT == MVT::x86mmx)
02448         RC = &X86::VR64RegClass;
02449       else if (RegVT == MVT::i1)
02450         RC = &X86::VK1RegClass;
02451       else if (RegVT == MVT::v8i1)
02452         RC = &X86::VK8RegClass;
02453       else if (RegVT == MVT::v16i1)
02454         RC = &X86::VK16RegClass;
02455       else if (RegVT == MVT::v32i1)
02456         RC = &X86::VK32RegClass;
02457       else if (RegVT == MVT::v64i1)
02458         RC = &X86::VK64RegClass;
02459       else
02460         llvm_unreachable("Unknown argument type!");
02461 
02462       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
02463       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
02464 
02465       // If this is an 8 or 16-bit value, it is really passed promoted to 32
02466       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
02467       // right size.
02468       if (VA.getLocInfo() == CCValAssign::SExt)
02469         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
02470                                DAG.getValueType(VA.getValVT()));
02471       else if (VA.getLocInfo() == CCValAssign::ZExt)
02472         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
02473                                DAG.getValueType(VA.getValVT()));
02474       else if (VA.getLocInfo() == CCValAssign::BCvt)
02475         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
02476 
02477       if (VA.isExtInLoc()) {
02478         // Handle MMX values passed in XMM regs.
02479         if (RegVT.isVector())
02480           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
02481         else
02482           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
02483       }
02484     } else {
02485       assert(VA.isMemLoc());
02486       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
02487     }
02488 
02489     // If the value is passed via a pointer, do a load.
02490     if (VA.getLocInfo() == CCValAssign::Indirect)
02491       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
02492                              MachinePointerInfo(), false, false, false, 0);
02493 
02494     InVals.push_back(ArgValue);
02495   }
02496 
02497   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
02498     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02499       // The x86-64 ABIs require that for returning structs by value we copy
02500       // the sret argument into %rax/%eax (depending on ABI) for the return.
02501       // Win32 requires us to put the sret argument to %eax as well.
02502       // Save the argument into a virtual register so that we can access it
02503       // from the return points.
02504       if (Ins[i].Flags.isSRet()) {
02505         unsigned Reg = FuncInfo->getSRetReturnReg();
02506         if (!Reg) {
02507           MVT PtrTy = getPointerTy();
02508           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
02509           FuncInfo->setSRetReturnReg(Reg);
02510         }
02511         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
02512         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
02513         break;
02514       }
02515     }
02516   }
02517 
02518   unsigned StackSize = CCInfo.getNextStackOffset();
02519   // Align stack specially for tail calls.
02520   if (FuncIsMadeTailCallSafe(CallConv,
02521                              MF.getTarget().Options.GuaranteedTailCallOpt))
02522     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
02523 
02524   // If the function takes a variable number of arguments, make a frame index for
02525   // the start of the first vararg value... for expansion of llvm.va_start. We
02526   // can skip this if there are no va_start calls.
02527   if (MFI->hasVAStart() &&
02528       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
02529                    CallConv != CallingConv::X86_ThisCall))) {
02530     FuncInfo->setVarArgsFrameIndex(
02531         MFI->CreateFixedObject(1, StackSize, true));
02532   }
02533 
02534   // 64-bit calling conventions support varargs and register parameters, so we
02535   // have to do extra work to spill them in the prologue or forward them to
02536   // musttail calls.
02537   if (Is64Bit && isVarArg &&
02538       (MFI->hasVAStart() || MFI->hasMustTailInVarArgFunc())) {
02539     // Find the first unallocated GPR and XMM argument registers.
02540     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
02541     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
02542     unsigned NumIntRegs =
02543         CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
02544     unsigned NumXMMRegs =
02545         CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
02546     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
02547            "SSE register cannot be used when SSE is disabled!");
02548 
02549     // Gather all the live-in physical registers.
02550     SmallVector<SDValue, 6> LiveGPRs;
02551     SmallVector<SDValue, 8> LiveXMMRegs;
02552     SDValue ALVal;
02553     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
02554       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
02555       LiveGPRs.push_back(
02556           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
02557     }
02558     if (!ArgXMMs.empty()) {
02559       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02560       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
02561       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
02562         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
02563         LiveXMMRegs.push_back(
02564             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
02565       }
02566     }
02567 
02568     // Store them to the register save area that va_start's va_list references.
02569     if (MFI->hasVAStart()) {
02570       if (IsWin64) {
02571         const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
02572         // Get to the caller-allocated home save location.  Add 8 to account
02573         // for the return address.
02574         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
02575         FuncInfo->setRegSaveFrameIndex(
02576           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
02577         // Fix up the vararg frame index to point into the shadow area (4 x i64).
02578         if (NumIntRegs < 4)
02579           FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
02580       } else {
02581         // For X86-64, if there are vararg parameters that are passed via
02582         // registers, then we must store them to their spots on the stack so
02583         // they may be loaded by dereferencing the result of va_arg.
02584         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
02585         FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
02586         FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
02587             ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
02588       }
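      // For reference, on SysV x86-64 the va_list that va_start fills in has
      // the form
      //   struct { i32 gp_offset; i32 fp_offset;
      //            i8 *overflow_arg_area; i8 *reg_save_area; };
      // The gp/fp offsets recorded above index into the register save area
      // created here, and overflow_arg_area is pointed at the vararg frame
      // index when llvm.va_start is lowered.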
02589 
02590       // Store the integer parameter registers.
02591       SmallVector<SDValue, 8> MemOps;
02592       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
02593                                         getPointerTy());
02594       unsigned Offset = FuncInfo->getVarArgsGPOffset();
02595       for (SDValue Val : LiveGPRs) {
02596         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
02597                                   DAG.getIntPtrConstant(Offset));
02598         SDValue Store =
02599           DAG.getStore(Val.getValue(1), dl, Val, FIN,
02600                        MachinePointerInfo::getFixedStack(
02601                          FuncInfo->getRegSaveFrameIndex(), Offset),
02602                        false, false, 0);
02603         MemOps.push_back(Store);
02604         Offset += 8;
02605       }
02606 
02607       if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
02608         // Now store the XMM (fp + vector) parameter registers.
02609         SmallVector<SDValue, 12> SaveXMMOps;
02610         SaveXMMOps.push_back(Chain);
02611         SaveXMMOps.push_back(ALVal);
02612         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02613                                FuncInfo->getRegSaveFrameIndex()));
02614         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02615                                FuncInfo->getVarArgsFPOffset()));
02616         SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
02617                           LiveXMMRegs.end());
02618         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
02619                                      MVT::Other, SaveXMMOps));
02620       }
02621 
02622       if (!MemOps.empty())
02623         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
02624     } else {
02625       // Add all GPRs, al, and XMMs to the list of forwards.  We will add them
02626       // to the liveout set on a musttail call.
02627       assert(MFI->hasMustTailInVarArgFunc());
02628       auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
02629       typedef X86MachineFunctionInfo::Forward Forward;
02630 
02631       for (unsigned I = 0, E = LiveGPRs.size(); I != E; ++I) {
02632         unsigned VReg =
02633             MF.getRegInfo().createVirtualRegister(&X86::GR64RegClass);
02634         Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveGPRs[I]);
02635         Forwards.push_back(Forward(VReg, ArgGPRs[NumIntRegs + I], MVT::i64));
02636       }
02637 
02638       if (!ArgXMMs.empty()) {
02639         unsigned ALVReg =
02640             MF.getRegInfo().createVirtualRegister(&X86::GR8RegClass);
02641         Chain = DAG.getCopyToReg(Chain, dl, ALVReg, ALVal);
02642         Forwards.push_back(Forward(ALVReg, X86::AL, MVT::i8));
02643 
02644         for (unsigned I = 0, E = LiveXMMRegs.size(); I != E; ++I) {
02645           unsigned VReg =
02646               MF.getRegInfo().createVirtualRegister(&X86::VR128RegClass);
02647           Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveXMMRegs[I]);
02648           Forwards.push_back(
02649               Forward(VReg, ArgXMMs[NumXMMRegs + I], MVT::v4f32));
02650         }
02651       }
02652     }
02653   }
02654 
02655   // Some CCs need callee pop.
02656   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02657                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
02658     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
02659   } else {
02660     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
02661     // If this is an sret function, the return should pop the hidden pointer.
02662     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
02663         !Subtarget->getTargetTriple().isOSMSVCRT() &&
02664         argsAreStructReturn(Ins) == StackStructReturn)
02665       FuncInfo->setBytesToPopOnReturn(4);
02666   }
02667 
02668   if (!Is64Bit) {
02669     // RegSaveFrameIndex is X86-64 only.
02670     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
02671     if (CallConv == CallingConv::X86_FastCall ||
02672         CallConv == CallingConv::X86_ThisCall)
02673       // fastcc functions can't have varargs.
02674       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
02675   }
02676 
02677   FuncInfo->setArgumentStackSize(StackSize);
02678 
02679   return Chain;
02680 }
02681 
02682 SDValue
02683 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
02684                                     SDValue StackPtr, SDValue Arg,
02685                                     SDLoc dl, SelectionDAG &DAG,
02686                                     const CCValAssign &VA,
02687                                     ISD::ArgFlagsTy Flags) const {
02688   unsigned LocMemOffset = VA.getLocMemOffset();
02689   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
02690   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
02691   if (Flags.isByVal())
02692     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
02693 
02694   return DAG.getStore(Chain, dl, Arg, PtrOff,
02695                       MachinePointerInfo::getStack(LocMemOffset),
02696                       false, false, 0);
02697 }
02698 
02699 /// Emit a load of the return address if tail call
02700 /// optimization is performed and it is required.
02701 SDValue
02702 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
02703                                            SDValue &OutRetAddr, SDValue Chain,
02704                                            bool IsTailCall, bool Is64Bit,
02705                                            int FPDiff, SDLoc dl) const {
02706   // Adjust the Return address stack slot.
02707   EVT VT = getPointerTy();
02708   OutRetAddr = getReturnAddressFrameIndex(DAG);
02709 
02710   // Load the "old" Return address.
02711   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
02712                            false, false, false, 0);
02713   return SDValue(OutRetAddr.getNode(), 1);
02714 }
02715 
02716 /// Emit a store of the return address if tail call
02717 /// optimization is performed and it is required (FPDiff!=0).
02718 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
02719                                         SDValue Chain, SDValue RetAddrFrIdx,
02720                                         EVT PtrVT, unsigned SlotSize,
02721                                         int FPDiff, SDLoc dl) {
02722   // Store the return address to the appropriate stack slot.
02723   if (!FPDiff) return Chain;
02724   // Calculate the new stack slot for the return address.
02725   int NewReturnAddrFI =
02726     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
02727                                          false);
02728   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
02729   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
02730                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
02731                        false, false, 0);
02732   return Chain;
02733 }
02734 
02735 SDValue
02736 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
02737                              SmallVectorImpl<SDValue> &InVals) const {
02738   SelectionDAG &DAG                     = CLI.DAG;
02739   SDLoc &dl                             = CLI.DL;
02740   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
02741   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
02742   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
02743   SDValue Chain                         = CLI.Chain;
02744   SDValue Callee                        = CLI.Callee;
02745   CallingConv::ID CallConv              = CLI.CallConv;
02746   bool &isTailCall                      = CLI.IsTailCall;
02747   bool isVarArg                         = CLI.IsVarArg;
02748 
02749   MachineFunction &MF = DAG.getMachineFunction();
02750   bool Is64Bit        = Subtarget->is64Bit();
02751   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
02752   StructReturnType SR = callIsStructReturn(Outs);
02753   bool IsSibcall      = false;
02754   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
02755 
02756   if (MF.getTarget().Options.DisableTailCalls)
02757     isTailCall = false;
02758 
02759   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
02760   if (IsMustTail) {
02761     // Force this to be a tail call.  The verifier rules are enough to ensure
02762     // that we can lower this successfully without moving the return address
02763     // around.
02764     isTailCall = true;
02765   } else if (isTailCall) {
02766     // Check if it's really possible to do a tail call.
02767     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
02768                     isVarArg, SR != NotStructReturn,
02769                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
02770                     Outs, OutVals, Ins, DAG);
02771 
02772     // Sibcalls are automatically detected tailcalls which do not require
02773     // ABI changes.
02774     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
02775       IsSibcall = true;
02776 
02777     if (isTailCall)
02778       ++NumTailCalls;
02779   }
02780 
02781   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02782          "Var args not supported with calling convention fastcc, ghc or hipe");
02783 
02784   // Analyze operands of the call, assigning locations to each operand.
02785   SmallVector<CCValAssign, 16> ArgLocs;
02786   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02787 
02788   // Allocate shadow area for Win64
02789   if (IsWin64)
02790     CCInfo.AllocateStack(32, 8);
02791 
02792   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
02793 
02794   // Get a count of how many bytes are to be pushed on the stack.
02795   unsigned NumBytes = CCInfo.getNextStackOffset();
02796   if (IsSibcall)
02797     // This is a sibcall. The memory operands are already available in the
02798     // caller's own incoming argument area on the stack.
02799     NumBytes = 0;
02800   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
02801            IsTailCallConvention(CallConv))
02802     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
02803 
02804   int FPDiff = 0;
02805   if (isTailCall && !IsSibcall && !IsMustTail) {
02806     // Lower arguments at fp - stackoffset + fpdiff.
02807     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
02808 
02809     FPDiff = NumBytesCallerPushed - NumBytes;
02810 
02811     // Set the delta of movement of the returnaddr stackslot.
02812     // But only set if delta is greater than previous delta.
02813     if (FPDiff < X86Info->getTCReturnAddrDelta())
02814       X86Info->setTCReturnAddrDelta(FPDiff);
02815   }
02816 
02817   unsigned NumBytesToPush = NumBytes;
02818   unsigned NumBytesToPop = NumBytes;
02819 
02820   // If we have an inalloca argument, all stack space has already been allocated
02821   // for us and is right at the top of the stack.  We don't support multiple
02822   // arguments passed in memory when using inalloca.
02823   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
02824     NumBytesToPush = 0;
02825     if (!ArgLocs.back().isMemLoc())
02826       report_fatal_error("cannot use inalloca attribute on a register "
02827                          "parameter");
02828     if (ArgLocs.back().getLocMemOffset() != 0)
02829       report_fatal_error("any parameter with the inalloca attribute must be "
02830                          "the only memory argument");
02831   }
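  // For reference, an inalloca call looks roughly like this in IR:
  //   %argmem = alloca inalloca <{ %struct.S }>
  //   ; ...initialize %argmem...
  //   call void @f(<{ %struct.S }>* inalloca %argmem)
  // The caller materializes the outgoing argument memory itself, which is why
  // no additional bytes are pushed for it here.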
02832 
02833   if (!IsSibcall)
02834     Chain = DAG.getCALLSEQ_START(
02835         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
02836 
02837   SDValue RetAddrFrIdx;
02838   // Load return address for tail calls.
02839   if (isTailCall && FPDiff)
02840     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
02841                                     Is64Bit, FPDiff, dl);
02842 
02843   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
02844   SmallVector<SDValue, 8> MemOpChains;
02845   SDValue StackPtr;
02846 
02847   // Walk the register/memloc assignments, inserting copies/loads.  In the case
02848   // of tail call optimization, arguments are handled later.
02849   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
02850       DAG.getSubtarget().getRegisterInfo());
02851   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02852     // Skip inalloca arguments, they have already been written.
02853     ISD::ArgFlagsTy Flags = Outs[i].Flags;
02854     if (Flags.isInAlloca())
02855       continue;
02856 
02857     CCValAssign &VA = ArgLocs[i];
02858     EVT RegVT = VA.getLocVT();
02859     SDValue Arg = OutVals[i];
02860     bool isByVal = Flags.isByVal();
02861 
02862     // Promote the value if needed.
02863     switch (VA.getLocInfo()) {
02864     default: llvm_unreachable("Unknown loc info!");
02865     case CCValAssign::Full: break;
02866     case CCValAssign::SExt:
02867       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
02868       break;
02869     case CCValAssign::ZExt:
02870       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
02871       break;
02872     case CCValAssign::AExt:
02873       if (RegVT.is128BitVector()) {
02874         // Special case: passing MMX values in XMM registers.
02875         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
02876         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
02877         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
02878       } else
02879         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
02880       break;
02881     case CCValAssign::BCvt:
02882       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
02883       break;
02884     case CCValAssign::Indirect: {
02885       // Store the argument.
02886       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
02887       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
02888       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
02889                            MachinePointerInfo::getFixedStack(FI),
02890                            false, false, 0);
02891       Arg = SpillSlot;
02892       break;
02893     }
02894     }
02895 
02896     if (VA.isRegLoc()) {
02897       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
02898       if (isVarArg && IsWin64) {
02899         // Win64 ABI requires argument XMM reg to be copied to the corresponding
02900         // shadow reg if callee is a varargs function.
02901         unsigned ShadowReg = 0;
02902         switch (VA.getLocReg()) {
02903         case X86::XMM0: ShadowReg = X86::RCX; break;
02904         case X86::XMM1: ShadowReg = X86::RDX; break;
02905         case X86::XMM2: ShadowReg = X86::R8; break;
02906         case X86::XMM3: ShadowReg = X86::R9; break;
02907         }
02908         if (ShadowReg)
02909           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
02910       }
02911     } else if (!IsSibcall && (!isTailCall || isByVal)) {
02912       assert(VA.isMemLoc());
02913       if (!StackPtr.getNode())
02914         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
02915                                       getPointerTy());
02916       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
02917                                              dl, DAG, VA, Flags));
02918     }
02919   }
02920 
02921   if (!MemOpChains.empty())
02922     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
02923 
02924   if (Subtarget->isPICStyleGOT()) {
02925     // ELF / PIC requires GOT in the EBX register before function calls via PLT
02926     // GOT pointer.
02927     if (!isTailCall) {
02928       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
02929                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
02930     } else {
02931       // If we are tail calling and generating PIC/GOT style code load the
02932       // address of the callee into ECX. The value in ecx is used as target of
02933       // the tail jump. This is done to circumvent the ebx/callee-saved problem
02934       // for tail calls on PIC/GOT architectures. Normally we would just put the
02935       // address of GOT into ebx and then call target@PLT. But for tail calls
02936       // ebx would be restored (since ebx is callee saved) before jumping to the
02937       // target@PLT.
02938 
02939       // Note: The actual moving to ECX is done further down.
02940       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
02941       if (G && !G->getGlobal()->hasHiddenVisibility() &&
02942           !G->getGlobal()->hasProtectedVisibility())
02943         Callee = LowerGlobalAddress(Callee, DAG);
02944       else if (isa<ExternalSymbolSDNode>(Callee))
02945         Callee = LowerExternalSymbol(Callee, DAG);
02946     }
02947   }
02948 
02949   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
02950     // From AMD64 ABI document:
02951     // For calls that may call functions that use varargs or stdargs
02952     // (prototype-less calls or calls to functions containing ellipsis (...) in
02953     // the declaration) %al is used as hidden argument to specify the number
02954     // of SSE registers used. The contents of %al do not need to match exactly
02955     // the number of registers, but must be an upper bound on the number of SSE
02956     // registers used and is in the range 0 - 8 inclusive.
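    // For example, for a call such as printf("%f %f\n", x, y) with x and y of
    // type double, two XMM registers carry the variadic FP arguments, so %al
    // is set to 2 here (any upper bound from 2 to 8 would also satisfy the
    // ABI).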
02957 
02958     // Count the number of XMM registers allocated.
02959     static const MCPhysReg XMMArgRegs[] = {
02960       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02961       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02962     };
02963     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
02964     assert((Subtarget->hasSSE1() || !NumXMMRegs)
02965            && "SSE registers cannot be used when SSE is disabled");
02966 
02967     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
02968                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
02969   }
02970 
02971   if (Is64Bit && isVarArg && IsMustTail) {
02972     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
02973     for (const auto &F : Forwards) {
02974       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
02975       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
02976     }
02977   }
02978 
02979   // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
02980   // don't need this because the eligibility check rejects calls that require
02981   // shuffling arguments passed in memory.
02982   if (!IsSibcall && isTailCall) {
02983     // Force all the incoming stack arguments to be loaded from the stack
02984     // before any new outgoing arguments are stored to the stack, because the
02985     // outgoing stack slots may alias the incoming argument stack slots, and
02986     // the alias isn't otherwise explicit. This is slightly more conservative
02987     // than necessary, because it means that each store effectively depends
02988     // on every argument instead of just those arguments it would clobber.
02989     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
02990 
02991     SmallVector<SDValue, 8> MemOpChains2;
02992     SDValue FIN;
02993     int FI = 0;
02994     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02995       CCValAssign &VA = ArgLocs[i];
02996       if (VA.isRegLoc())
02997         continue;
02998       assert(VA.isMemLoc());
02999       SDValue Arg = OutVals[i];
03000       ISD::ArgFlagsTy Flags = Outs[i].Flags;
03001       // Skip inalloca arguments.  They don't require any work.
03002       if (Flags.isInAlloca())
03003         continue;
03004       // Create frame index.
03005       int32_t Offset = VA.getLocMemOffset()+FPDiff;
03006       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
03007       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
03008       FIN = DAG.getFrameIndex(FI, getPointerTy());
03009 
03010       if (Flags.isByVal()) {
03011         // Copy relative to framepointer.
03012         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
03013         if (!StackPtr.getNode())
03014           StackPtr = DAG.getCopyFromReg(Chain, dl,
03015                                         RegInfo->getStackRegister(),
03016                                         getPointerTy());
03017         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
03018 
03019         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
03020                                                          ArgChain,
03021                                                          Flags, DAG, dl));
03022       } else {
03023         // Store relative to framepointer.
03024         MemOpChains2.push_back(
03025           DAG.getStore(ArgChain, dl, Arg, FIN,
03026                        MachinePointerInfo::getFixedStack(FI),
03027                        false, false, 0));
03028       }
03029     }
03030 
03031     if (!MemOpChains2.empty())
03032       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
03033 
03034     // Store the return address to the appropriate stack slot.
03035     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
03036                                      getPointerTy(), RegInfo->getSlotSize(),
03037                                      FPDiff, dl);
03038   }
03039 
03040   // Build a sequence of copy-to-reg nodes chained together with token chain
03041   // and flag operands which copy the outgoing args into registers.
03042   SDValue InFlag;
03043   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
03044     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
03045                              RegsToPass[i].second, InFlag);
03046     InFlag = Chain.getValue(1);
03047   }
03048 
03049   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
03050     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
03051     // In the 64-bit large code model, we have to make all calls
03052     // through a register, since the call instruction's 32-bit
03053     // pc-relative offset may not be large enough to hold the whole
03054     // address.
03055   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
03056     // If the callee is a GlobalAddress node (quite common, every direct call
03057     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
03058     // it.
03059 
03060     // We should use extra load for direct calls to dllimported functions in
03061     // non-JIT mode.
03062     const GlobalValue *GV = G->getGlobal();
03063     if (!GV->hasDLLImportStorageClass()) {
03064       unsigned char OpFlags = 0;
03065       bool ExtraLoad = false;
03066       unsigned WrapperKind = ISD::DELETED_NODE;
03067 
03068       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
03069       // external symbols must go through the PLT in PIC mode.  If the symbol
03070       // has hidden or protected visibility, or if it is static or local, then
03071       // we don't need to use the PLT - we can directly call it.
03072       if (Subtarget->isTargetELF() &&
03073           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
03074           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
03075         OpFlags = X86II::MO_PLT;
03076       } else if (Subtarget->isPICStyleStubAny() &&
03077                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
03078                  (!Subtarget->getTargetTriple().isMacOSX() ||
03079                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03080         // PC-relative references to external symbols should go through $stub,
03081         // unless we're building with the leopard linker or later, which
03082         // automatically synthesizes these stubs.
03083         OpFlags = X86II::MO_DARWIN_STUB;
03084       } else if (Subtarget->isPICStyleRIPRel() &&
03085                  isa<Function>(GV) &&
03086                  cast<Function>(GV)->getAttributes().
03087                    hasAttribute(AttributeSet::FunctionIndex,
03088                                 Attribute::NonLazyBind)) {
03089         // If the function is marked as non-lazy, generate an indirect call
03090         // which loads from the GOT directly. This avoids runtime overhead
03091         // at the cost of eager binding (and one extra byte of encoding).
03092         OpFlags = X86II::MO_GOTPCREL;
03093         WrapperKind = X86ISD::WrapperRIP;
03094         ExtraLoad = true;
03095       }
03096 
03097       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
03098                                           G->getOffset(), OpFlags);
03099 
03100       // Add a wrapper if needed.
03101       if (WrapperKind != ISD::DELETED_NODE)
03102         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
03103       // Add extra indirection if needed.
03104       if (ExtraLoad)
03105         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
03106                              MachinePointerInfo::getGOT(),
03107                              false, false, false, 0);
03108     }
03109   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
03110     unsigned char OpFlags = 0;
03111 
03112     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
03113     // external symbols should go through the PLT.
03114     if (Subtarget->isTargetELF() &&
03115         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
03116       OpFlags = X86II::MO_PLT;
03117     } else if (Subtarget->isPICStyleStubAny() &&
03118                (!Subtarget->getTargetTriple().isMacOSX() ||
03119                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03120       // PC-relative references to external symbols should go through $stub,
03121       // unless we're building with the leopard linker or later, which
03122       // automatically synthesizes these stubs.
03123       OpFlags = X86II::MO_DARWIN_STUB;
03124     }
03125 
03126     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
03127                                          OpFlags);
03128   } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
03129     // Zero-extend the 32-bit Callee address to 64 bits, as required by the x32 ABI.
03130     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
03131   }
03132 
03133   // Returns a chain & a flag for retval copy to use.
03134   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
03135   SmallVector<SDValue, 8> Ops;
03136 
03137   if (!IsSibcall && isTailCall) {
03138     Chain = DAG.getCALLSEQ_END(Chain,
03139                                DAG.getIntPtrConstant(NumBytesToPop, true),
03140                                DAG.getIntPtrConstant(0, true), InFlag, dl);
03141     InFlag = Chain.getValue(1);
03142   }
03143 
03144   Ops.push_back(Chain);
03145   Ops.push_back(Callee);
03146 
03147   if (isTailCall)
03148     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
03149 
03150   // Add argument registers to the end of the list so that they are known live
03151   // into the call.
03152   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
03153     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
03154                                   RegsToPass[i].second.getValueType()));
03155 
03156   // Add a register mask operand representing the call-preserved registers.
03157   const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
03158   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
03159   assert(Mask && "Missing call preserved mask for calling convention");
03160   Ops.push_back(DAG.getRegisterMask(Mask));
03161 
03162   if (InFlag.getNode())
03163     Ops.push_back(InFlag);
03164 
03165   if (isTailCall) {
03166     // We used to do:
03167     //// If this is the first return lowered for this function, add the regs
03168     //// to the liveout set for the function.
03169     // This isn't right, although it's probably harmless on x86; liveouts
03170     // should be computed from returns not tail calls.  Consider a void
03171     // function making a tail call to a function returning int.
03172     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
03173   }
03174 
03175   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
03176   InFlag = Chain.getValue(1);
03177 
03178   // Create the CALLSEQ_END node.
03179   unsigned NumBytesForCalleeToPop;
03180   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
03181                        DAG.getTarget().Options.GuaranteedTailCallOpt))
03182     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
03183   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
03184            !Subtarget->getTargetTriple().isOSMSVCRT() &&
03185            SR == StackStructReturn)
03186     // If this is a call to a struct-return function, the callee
03187     // pops the hidden struct pointer, so we have to push it back.
03188     // This is common for Darwin/X86, Linux & Mingw32 targets.
03189     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
03190     NumBytesForCalleeToPop = 4;
03191   else
03192     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
03193 
03194   // Returns a flag for retval copy to use.
03195   if (!IsSibcall) {
03196     Chain = DAG.getCALLSEQ_END(Chain,
03197                                DAG.getIntPtrConstant(NumBytesToPop, true),
03198                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
03199                                                      true),
03200                                InFlag, dl);
03201     InFlag = Chain.getValue(1);
03202   }
03203 
03204   // Handle result values, copying them out of physregs into vregs that we
03205   // return.
03206   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
03207                          Ins, dl, DAG, InVals);
03208 }
03209 
03210 //===----------------------------------------------------------------------===//
03211 //                Fast Calling Convention (tail call) implementation
03212 //===----------------------------------------------------------------------===//
03213 
03214 //  Like stdcall, the callee cleans up the arguments; unlike stdcall, ECX is
03215 //  reserved for storing the tail-called function address. Only 2 registers are
03216 //  free for argument passing (inreg). Tail call optimization is performed
03217 //  provided:
03218 //                * tailcallopt is enabled
03219 //                * caller/callee are fastcc
03220 //  On X86_64 architecture with GOT-style position independent code only local
03221 //  (within module) calls are supported at the moment.
03222 //  To keep the stack aligned according to the platform ABI, the function
03223 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
03224 //  multiple of the stack alignment. (Dynamic linkers such as darwin's dyld need this.)
03225 //  If a tail-called callee has more arguments than the caller, the caller
03226 //  needs to make sure that there is room to move the RETADDR to. This is
03227 //  achieved by reserving an area the size of the argument delta right after the
03228 //  original RETADDR, but before the saved frame pointer or the spilled registers,
03229 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
03230 //  stack layout:
03231 //    arg1
03232 //    arg2
03233 //    RETADDR
03234 //    [ new RETADDR
03235 //      move area ]
03236 //    (possible EBP)
03237 //    ESI
03238 //    EDI
03239 //    local1 ..
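//  For example, if the callee needs N more bytes of argument space than the
//  caller received, the move area is N bytes, and the (negative) FPDiff
//  computed in LowerCall tells EmitTailCallStoreRetAddr where to re-store the
//  RETADDR.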
03240 
03241 /// GetAlignedArgumentStackSize - Round the argument stack size up to keep the
03242 /// stack aligned, e.g. to 16n + 12 bytes for a 16-byte alignment requirement.
03243 unsigned
03244 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
03245                                                SelectionDAG& DAG) const {
03246   MachineFunction &MF = DAG.getMachineFunction();
03247   const TargetMachine &TM = MF.getTarget();
03248   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03249       TM.getSubtargetImpl()->getRegisterInfo());
03250   const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
03251   unsigned StackAlignment = TFI.getStackAlignment();
03252   uint64_t AlignMask = StackAlignment - 1;
03253   int64_t Offset = StackSize;
03254   unsigned SlotSize = RegInfo->getSlotSize();
03255   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
03256     // The low bits are at most (StackAlignment - SlotSize), so just add the difference.
03257     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
03258   } else {
03259     // Mask out the lower bits and add one full StackAlignment plus (StackAlignment - SlotSize).
03260     Offset = ((~AlignMask) & Offset) + StackAlignment +
03261       (StackAlignment-SlotSize);
03262   }
03263   return Offset;
03264 }
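// For example, with a 16-byte stack alignment and 4-byte slots (32-bit mode),
// an argument area of 20 bytes takes the first branch and is bumped to
// 28 = 16n + 12, so the argument area plus the 4-byte return address totals a
// multiple of 16.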
03265 
03266 /// MatchingStackOffset - Return true if the given stack call argument is
03267 /// already available in the same position (relatively) of the caller's
03268 /// incoming argument stack.
03269 static
03270 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
03271                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
03272                          const X86InstrInfo *TII) {
03273   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
03274   int FI = INT_MAX;
03275   if (Arg.getOpcode() == ISD::CopyFromReg) {
03276     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
03277     if (!TargetRegisterInfo::isVirtualRegister(VR))
03278       return false;
03279     MachineInstr *Def = MRI->getVRegDef(VR);
03280     if (!Def)
03281       return false;
03282     if (!Flags.isByVal()) {
03283       if (!TII->isLoadFromStackSlot(Def, FI))
03284         return false;
03285     } else {
03286       unsigned Opcode = Def->getOpcode();
03287       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
03288           Def->getOperand(1).isFI()) {
03289         FI = Def->getOperand(1).getIndex();
03290         Bytes = Flags.getByValSize();
03291       } else
03292         return false;
03293     }
03294   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
03295     if (Flags.isByVal())
03296       // ByVal argument is passed in as a pointer but it's now being
03297       // dereferenced. e.g.
03298       // define @foo(%struct.X* %A) {
03299       //   tail call @bar(%struct.X* byval %A)
03300       // }
03301       return false;
03302     SDValue Ptr = Ld->getBasePtr();
03303     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
03304     if (!FINode)
03305       return false;
03306     FI = FINode->getIndex();
03307   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
03308     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
03309     FI = FINode->getIndex();
03310     Bytes = Flags.getByValSize();
03311   } else
03312     return false;
03313 
03314   assert(FI != INT_MAX);
03315   if (!MFI->isFixedObjectIndex(FI))
03316     return false;
03317   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
03318 }
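// For example, if a 32-bit function f(i32 %x) forwards %x unchanged in a tail
// call to a hypothetical g(i32), the outgoing argument is a load from f's own
// fixed incoming stack slot at the matching offset, so MatchingStackOffset
// returns true and no argument store is needed.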
03319 
03320 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
03321 /// for tail call optimization. Targets which want to do tail call
03322 /// optimization should implement this function.
03323 bool
03324 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
03325                                                      CallingConv::ID CalleeCC,
03326                                                      bool isVarArg,
03327                                                      bool isCalleeStructRet,
03328                                                      bool isCallerStructRet,
03329                                                      Type *RetTy,
03330                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
03331                                     const SmallVectorImpl<SDValue> &OutVals,
03332                                     const SmallVectorImpl<ISD::InputArg> &Ins,
03333                                                      SelectionDAG &DAG) const {
03334   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
03335     return false;
03336 
03337   // If -tailcallopt is specified, make fastcc functions tail-callable.
03338   const MachineFunction &MF = DAG.getMachineFunction();
03339   const Function *CallerF = MF.getFunction();
03340 
03341   // If the function return type is x86_fp80 and the callee return type is not,
03342   // then the FP_EXTEND of the call result is not a nop. It's not safe to
03343   // perform a tailcall optimization here.
03344   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
03345     return false;
03346 
03347   CallingConv::ID CallerCC = CallerF->getCallingConv();
03348   bool CCMatch = CallerCC == CalleeCC;
03349   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
03350   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
03351 
03352   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
03353     if (IsTailCallConvention(CalleeCC) && CCMatch)
03354       return true;
03355     return false;
03356   }
03357 
03358   // Look for obvious safe cases to perform tail call optimization that do not
03359   // require ABI changes. This is what gcc calls sibcall.
03360 
03361   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
03362   // emit a special epilogue.
03363   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03364       DAG.getSubtarget().getRegisterInfo());
03365   if (RegInfo->needsStackRealignment(MF))
03366     return false;
03367 
03368   // Also avoid sibcall optimization if either caller or callee uses struct
03369   // return semantics.
03370   if (isCalleeStructRet || isCallerStructRet)
03371     return false;
03372 
03373   // An stdcall/thiscall caller is itself expected to pop its incoming arguments
03374   // on return; a tail-called callee with a different convention won't do that.
03375   // FIXME: this is more restrictive than needed. We could produce a tailcall
03376   // when the stack adjustment matches. For example, with a thiscall that takes
03377   // only one argument.
03378   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
03379                    CallerCC == CallingConv::X86_ThisCall))
03380     return false;
03381 
03382   // Do not sibcall optimize vararg calls unless all arguments are passed via
03383   // registers.
03384   if (isVarArg && !Outs.empty()) {
03385 
03386     // Optimizing for varargs on Win64 is unlikely to be safe without
03387     // additional testing.
03388     if (IsCalleeWin64 || IsCallerWin64)
03389       return false;
03390 
03391     SmallVector<CCValAssign, 16> ArgLocs;
03392     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03393                    *DAG.getContext());
03394 
03395     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03396     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
03397       if (!ArgLocs[i].isRegLoc())
03398         return false;
03399   }
03400 
03401   // If the call result is in ST0 / ST1, it needs to be popped off the x87
03402   // stack.  Therefore, if the result is not used by the caller, it is not safe
03403   // to optimize this into a sibcall.
03404   bool Unused = false;
03405   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
03406     if (!Ins[i].Used) {
03407       Unused = true;
03408       break;
03409     }
03410   }
03411   if (Unused) {
03412     SmallVector<CCValAssign, 16> RVLocs;
03413     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
03414                    *DAG.getContext());
03415     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
03416     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
03417       CCValAssign &VA = RVLocs[i];
03418       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
03419         return false;
03420     }
03421   }
03422 
03423   // If the calling conventions do not match, then we'd better make sure the
03424   // results are returned in the same way as what the caller expects.
03425   if (!CCMatch) {
03426     SmallVector<CCValAssign, 16> RVLocs1;
03427     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
03428                     *DAG.getContext());
03429     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
03430 
03431     SmallVector<CCValAssign, 16> RVLocs2;
03432     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
03433                     *DAG.getContext());
03434     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
03435 
03436     if (RVLocs1.size() != RVLocs2.size())
03437       return false;
03438     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
03439       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
03440         return false;
03441       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
03442         return false;
03443       if (RVLocs1[i].isRegLoc()) {
03444         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
03445           return false;
03446       } else {
03447         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
03448           return false;
03449       }
03450     }
03451   }
03452 
03453   // If the callee takes no arguments then go on to check the results of the
03454   // call.
03455   if (!Outs.empty()) {
03456     // Check if stack adjustment is needed. For now, do not do this if any
03457     // argument is passed on the stack.
03458     SmallVector<CCValAssign, 16> ArgLocs;
03459     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03460                    *DAG.getContext());
03461 
03462     // Allocate shadow area for Win64
03463     if (IsCalleeWin64)
03464       CCInfo.AllocateStack(32, 8);
03465 
03466     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03467     if (CCInfo.getNextStackOffset()) {
03468       MachineFunction &MF = DAG.getMachineFunction();
03469       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
03470         return false;
03471 
03472       // Check if the arguments are already laid out in the right way as
03473       // the caller's fixed stack objects.
03474       MachineFrameInfo *MFI = MF.getFrameInfo();
03475       const MachineRegisterInfo *MRI = &MF.getRegInfo();
03476       const X86InstrInfo *TII =
03477           static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
03478       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03479         CCValAssign &VA = ArgLocs[i];
03480         SDValue Arg = OutVals[i];
03481         ISD::ArgFlagsTy Flags = Outs[i].Flags;
03482         if (VA.getLocInfo() == CCValAssign::Indirect)
03483           return false;
03484         if (!VA.isRegLoc()) {
03485           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
03486                                    MFI, MRI, TII))
03487             return false;
03488         }
03489       }
03490     }
03491 
03492     // If the tailcall address may be in a register, then make sure it's
03493     // possible to register allocate for it. In 32-bit, the call address can
03494     // only target EAX, EDX, or ECX since the tail call must be scheduled after
03495     // callee-saved registers are restored. These happen to be the same
03496     // registers used to pass 'inreg' arguments so watch out for those.
03497     if (!Subtarget->is64Bit() &&
03498         ((!isa<GlobalAddressSDNode>(Callee) &&
03499           !isa<ExternalSymbolSDNode>(Callee)) ||
03500          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
03501       unsigned NumInRegs = 0;
03502       // In PIC we need an extra register to formulate the address computation
03503       // for the callee.
03504       unsigned MaxInRegs =
03505         (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
03506 
03507       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03508         CCValAssign &VA = ArgLocs[i];
03509         if (!VA.isRegLoc())
03510           continue;
03511         unsigned Reg = VA.getLocReg();
03512         switch (Reg) {
03513         default: break;
03514         case X86::EAX: case X86::EDX: case X86::ECX:
03515           if (++NumInRegs == MaxInRegs)
03516             return false;
03517           break;
03518         }
03519       }
03520     }
03521   }
03522 
03523   return true;
03524 }
03525 
03526 FastISel *
03527 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
03528                                   const TargetLibraryInfo *libInfo) const {
03529   return X86::createFastISel(funcInfo, libInfo);
03530 }
03531 
03532 //===----------------------------------------------------------------------===//
03533 //                           Other Lowering Hooks
03534 //===----------------------------------------------------------------------===//
03535 
03536 static bool MayFoldLoad(SDValue Op) {
03537   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
03538 }
03539 
03540 static bool MayFoldIntoStore(SDValue Op) {
03541   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
03542 }
03543 
03544 static bool isTargetShuffle(unsigned Opcode) {
03545   switch(Opcode) {
03546   default: return false;
03547   case X86ISD::BLENDI:
03548   case X86ISD::PSHUFB:
03549   case X86ISD::PSHUFD:
03550   case X86ISD::PSHUFHW:
03551   case X86ISD::PSHUFLW:
03552   case X86ISD::SHUFP:
03553   case X86ISD::PALIGNR:
03554   case X86ISD::MOVLHPS:
03555   case X86ISD::MOVLHPD:
03556   case X86ISD::MOVHLPS:
03557   case X86ISD::MOVLPS:
03558   case X86ISD::MOVLPD:
03559   case X86ISD::MOVSHDUP:
03560   case X86ISD::MOVSLDUP:
03561   case X86ISD::MOVDDUP:
03562   case X86ISD::MOVSS:
03563   case X86ISD::MOVSD:
03564   case X86ISD::UNPCKL:
03565   case X86ISD::UNPCKH:
03566   case X86ISD::VPERMILPI:
03567   case X86ISD::VPERM2X128:
03568   case X86ISD::VPERMI:
03569     return true;
03570   }
03571 }
03572 
03573 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03574                                     SDValue V1, SelectionDAG &DAG) {
03575   switch(Opc) {
03576   default: llvm_unreachable("Unknown x86 shuffle node");
03577   case X86ISD::MOVSHDUP:
03578   case X86ISD::MOVSLDUP:
03579   case X86ISD::MOVDDUP:
03580     return DAG.getNode(Opc, dl, VT, V1);
03581   }
03582 }
03583 
03584 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03585                                     SDValue V1, unsigned TargetMask,
03586                                     SelectionDAG &DAG) {
03587   switch(Opc) {
03588   default: llvm_unreachable("Unknown x86 shuffle node");
03589   case X86ISD::PSHUFD:
03590   case X86ISD::PSHUFHW:
03591   case X86ISD::PSHUFLW:
03592   case X86ISD::VPERMILPI:
03593   case X86ISD::VPERMI:
03594     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
03595   }
03596 }
03597 
03598 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03599                                     SDValue V1, SDValue V2, unsigned TargetMask,
03600                                     SelectionDAG &DAG) {
03601   switch(Opc) {
03602   default: llvm_unreachable("Unknown x86 shuffle node");
03603   case X86ISD::PALIGNR:
03604   case X86ISD::VALIGN:
03605   case X86ISD::SHUFP:
03606   case X86ISD::VPERM2X128:
03607     return DAG.getNode(Opc, dl, VT, V1, V2,
03608                        DAG.getConstant(TargetMask, MVT::i8));
03609   }
03610 }
03611 
03612 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03613                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
03614   switch(Opc) {
03615   default: llvm_unreachable("Unknown x86 shuffle node");
03616   case X86ISD::MOVLHPS:
03617   case X86ISD::MOVLHPD:
03618   case X86ISD::MOVHLPS:
03619   case X86ISD::MOVLPS:
03620   case X86ISD::MOVLPD:
03621   case X86ISD::MOVSS:
03622   case X86ISD::MOVSD:
03623   case X86ISD::UNPCKL:
03624   case X86ISD::UNPCKH:
03625     return DAG.getNode(Opc, dl, VT, V1, V2);
03626   }
03627 }
03628 
03629 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
03630   MachineFunction &MF = DAG.getMachineFunction();
03631   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03632       DAG.getSubtarget().getRegisterInfo());
03633   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
03634   int ReturnAddrIndex = FuncInfo->getRAIndex();
03635 
03636   if (ReturnAddrIndex == 0) {
03637     // Set up a frame object for the return address.
03638     unsigned SlotSize = RegInfo->getSlotSize();
03639     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
03640                                                            -(int64_t)SlotSize,
03641                                                            false);
03642     FuncInfo->setRAIndex(ReturnAddrIndex);
03643   }
03644 
03645   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
03646 }
03647 
03648 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
03649                                        bool hasSymbolicDisplacement) {
03650   // Offset should fit into 32 bit immediate field.
03651   if (!isInt<32>(Offset))
03652     return false;
03653 
03654   // If we don't have a symbolic displacement - we don't have any extra
03655   // restrictions.
03656   if (!hasSymbolicDisplacement)
03657     return true;
03658 
03659   // FIXME: Some tweaks might be needed for medium code model.
03660   if (M != CodeModel::Small && M != CodeModel::Kernel)
03661     return false;
03662 
03663   // For the small code model we assume the last object ends within 16MB of the
03664   // 2GB (31-bit) boundary. We may also accept pretty large negative constants,
03665   // knowing that all objects are in the positive half of the address space.
03666   if (M == CodeModel::Small && Offset < 16*1024*1024)
03667     return true;
03668 
03669   // For the kernel code model we know that all objects reside in the negative
03670   // half of the 32-bit address space. We do not accept negative offsets, since
03671   // they may fall just out of range, but we may accept pretty large positive ones.
03672   if (M == CodeModel::Kernel && Offset >= 0)
03673     return true;
03674 
03675   return false;
03676 }
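// Worked examples (illustration only) of the rules above:
//   isOffsetSuitableForCodeModel(5 << 20, CodeModel::Small, true)    -> true
//     (5MB is well under the 16MB-before-2GB cutoff for the small model)
//   isOffsetSuitableForCodeModel(-8, CodeModel::Kernel, true)        -> false
//     (the kernel model rejects negative offsets with a symbolic displacement)
//   isOffsetSuitableForCodeModel(1LL << 40, CodeModel::Small, false) -> false
//     (does not fit a signed 32-bit immediate, regardless of the model)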
03677 
03678 /// isCalleePop - Determines whether the callee is required to pop its
03679 /// own arguments. Callee pop is necessary to support tail calls.
03680 bool X86::isCalleePop(CallingConv::ID CallingConv,
03681                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
03682   switch (CallingConv) {
03683   default:
03684     return false;
03685   case CallingConv::X86_StdCall:
03686   case CallingConv::X86_FastCall:
03687   case CallingConv::X86_ThisCall:
03688     return !is64Bit;
03689   case CallingConv::Fast:
03690   case CallingConv::GHC:
03691   case CallingConv::HiPE:
03692     if (IsVarArg)
03693       return false;
03694     return TailCallOpt;
03695   }
03696 }
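// Illustration only: with the rules above, a 32-bit stdcall callee pops its
// own arguments, while fastcc only does so when tail-call optimization is
// forced on and the call is not variadic:
//   isCalleePop(CallingConv::X86_StdCall, /*is64Bit=*/false,
//               /*IsVarArg=*/false, /*TailCallOpt=*/false)  -> true
//   isCalleePop(CallingConv::Fast, /*is64Bit=*/true,
//               /*IsVarArg=*/false, /*TailCallOpt=*/true)   -> true
//   isCalleePop(CallingConv::C, false, false, false)        -> false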
03697 
03698 /// \brief Return true if the condition is an unsigned comparison operation.
03699 static bool isX86CCUnsigned(unsigned X86CC) {
03700   switch (X86CC) {
03701   default: llvm_unreachable("Invalid integer condition!");
03702   case X86::COND_E:     return true;
03703   case X86::COND_G:     return false;
03704   case X86::COND_GE:    return false;
03705   case X86::COND_L:     return false;
03706   case X86::COND_LE:    return false;
03707   case X86::COND_NE:    return true;
03708   case X86::COND_B:     return true;
03709   case X86::COND_A:     return true;
03710   case X86::COND_BE:    return true;
03711   case X86::COND_AE:    return true;
03712   }
03713   llvm_unreachable("covered switch fell through?!");
03714 }
03715 
03716 /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86
03717 /// specific condition code, returning the condition code and the LHS/RHS of the
03718 /// comparison to make.
03719 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
03720                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
03721   if (!isFP) {
03722     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
03723       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
03724         // X > -1   -> X == 0, jump !sign.
03725         RHS = DAG.getConstant(0, RHS.getValueType());
03726         return X86::COND_NS;
03727       }
03728       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
03729         // X < 0   -> X == 0, jump on sign.
03730         return X86::COND_S;
03731       }
03732       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
03733         // X < 1   -> X <= 0
03734         RHS = DAG.getConstant(0, RHS.getValueType());
03735         return X86::COND_LE;
03736       }
03737     }
03738 
03739     switch (SetCCOpcode) {
03740     default: llvm_unreachable("Invalid integer condition!");
03741     case ISD::SETEQ:  return X86::COND_E;
03742     case ISD::SETGT:  return X86::COND_G;
03743     case ISD::SETGE:  return X86::COND_GE;
03744     case ISD::SETLT:  return X86::COND_L;
03745     case ISD::SETLE:  return X86::COND_LE;
03746     case ISD::SETNE:  return X86::COND_NE;
03747     case ISD::SETULT: return X86::COND_B;
03748     case ISD::SETUGT: return X86::COND_A;
03749     case ISD::SETULE: return X86::COND_BE;
03750     case ISD::SETUGE: return X86::COND_AE;
03751     }
03752   }
03753 
03754   // First determine if it is required or is profitable to flip the operands.
03755 
03756   // If LHS is a foldable load, but RHS is not, flip the condition.
03757   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
03758       !ISD::isNON_EXTLoad(RHS.getNode())) {
03759     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
03760     std::swap(LHS, RHS);
03761   }
03762 
03763   switch (SetCCOpcode) {
03764   default: break;
03765   case ISD::SETOLT:
03766   case ISD::SETOLE:
03767   case ISD::SETUGT:
03768   case ISD::SETUGE:
03769     std::swap(LHS, RHS);
03770     break;
03771   }
03772 
03773   // On a floating point condition, the flags are set as follows:
03774   // ZF  PF  CF   op
03775   //  0 | 0 | 0 | X > Y
03776   //  0 | 0 | 1 | X < Y
03777   //  1 | 0 | 0 | X == Y
03778   //  1 | 1 | 1 | unordered
03779   switch (SetCCOpcode) {
03780   default: llvm_unreachable("Condcode should be pre-legalized away");
03781   case ISD::SETUEQ:
03782   case ISD::SETEQ:   return X86::COND_E;
03783   case ISD::SETOLT:              // flipped
03784   case ISD::SETOGT:
03785   case ISD::SETGT:   return X86::COND_A;
03786   case ISD::SETOLE:              // flipped
03787   case ISD::SETOGE:
03788   case ISD::SETGE:   return X86::COND_AE;
03789   case ISD::SETUGT:              // flipped
03790   case ISD::SETULT:
03791   case ISD::SETLT:   return X86::COND_B;
03792   case ISD::SETUGE:              // flipped
03793   case ISD::SETULE:
03794   case ISD::SETLE:   return X86::COND_BE;
03795   case ISD::SETONE:
03796   case ISD::SETNE:   return X86::COND_NE;
03797   case ISD::SETUO:   return X86::COND_P;
03798   case ISD::SETO:    return X86::COND_NP;
03799   case ISD::SETOEQ:
03800   case ISD::SETUNE:  return X86::COND_INVALID;
03801   }
03802 }
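// Illustration only: two sample translations performed by TranslateX86CC.
//   Integer: (setlt %x, 1)  -> COND_LE with RHS rewritten to 0  (x < 1 == x <= 0)
//   FP:      SETOLT         -> operands swapped, then COND_A    (x < y == y > x,
//            matching the ZF/CF table above)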
03803 
03804 /// hasFPCMov - is there a floating point cmov for the specific X86 condition
03805 /// code. Current x86 isa includes the following FP cmov instructions:
03806 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
03807 static bool hasFPCMov(unsigned X86CC) {
03808   switch (X86CC) {
03809   default:
03810     return false;
03811   case X86::COND_B:
03812   case X86::COND_BE:
03813   case X86::COND_E:
03814   case X86::COND_P:
03815   case X86::COND_A:
03816   case X86::COND_AE:
03817   case X86::COND_NE:
03818   case X86::COND_NP:
03819     return true;
03820   }
03821 }
03822 
03823 /// isFPImmLegal - Returns true if the target can instruction select the
03824 /// specified FP immediate natively. If false, the legalizer will
03825 /// materialize the FP immediate as a load from a constant pool.
03826 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
03827   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
03828     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
03829       return true;
03830   }
03831   return false;
03832 }
03833 
03834 /// \brief Returns true if it is beneficial to convert a load of a constant
03835 /// to just the constant itself.
03836 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
03837                                                           Type *Ty) const {
03838   assert(Ty->isIntegerTy());
03839 
03840   unsigned BitSize = Ty->getPrimitiveSizeInBits();
03841   if (BitSize == 0 || BitSize > 64)
03842     return false;
03843   return true;
03844 }
03845 
03846 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, 
03847                                                 unsigned Index) const {
03848   if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
03849     return false;
03850 
03851   return (Index == 0 || Index == ResVT.getVectorNumElements());
03852 }
03853 
03854 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
03855 /// the specified half-open range [Low, Hi).
03856 static bool isUndefOrInRange(int Val, int Low, int Hi) {
03857   return (Val < 0) || (Val >= Low && Val < Hi);
03858 }
03859 
03860 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
03861 /// specified value.
03862 static bool isUndefOrEqual(int Val, int CmpVal) {
03863   return (Val < 0 || Val == CmpVal);
03864 }
03865 
03866 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
03867 /// at position Pos and ending before Pos+Size, is undef or falls within the
03868 /// specified sequential range [Low, Low+Size).
03869 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
03870                                        unsigned Pos, unsigned Size, int Low) {
03871   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
03872     if (!isUndefOrEqual(Mask[i], Low))
03873       return false;
03874   return true;
03875 }
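// Illustration only: for a mask such as <4, -1, 6, 7>,
// isSequentialOrUndefInRange(Mask, /*Pos=*/0, /*Size=*/4, /*Low=*/4) returns
// true, since every defined element equals Low plus its offset and the -1
// entry counts as undef.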
03876 
03877 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
03878 /// is suitable for input to PSHUFD. That is, it doesn't reference the other
03879 /// operand; by default it matches against the first operand.
03880 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
03881                          bool TestSecondOperand = false) {
03882   if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
03883       VT != MVT::v2f64 && VT != MVT::v2i64)
03884     return false;
03885 
03886   unsigned NumElems = VT.getVectorNumElements();
03887   unsigned Lo = TestSecondOperand ? NumElems : 0;
03888   unsigned Hi = Lo + NumElems;
03889 
03890   for (unsigned i = 0; i < NumElems; ++i)
03891     if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
03892       return false;
03893 
03894   return true;
03895 }
03896 
03897 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
03898 /// is suitable for input to PSHUFHW.
03899 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03900   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03901     return false;
03902 
03903   // Lower quadword copied in order or undef.
03904   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
03905     return false;
03906 
03907   // Upper quadword shuffled.
03908   for (unsigned i = 4; i != 8; ++i)
03909     if (!isUndefOrInRange(Mask[i], 4, 8))
03910       return false;
03911 
03912   if (VT == MVT::v16i16) {
03913     // Lower quadword copied in order or undef.
03914     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
03915       return false;
03916 
03917     // Upper quadword shuffled.
03918     for (unsigned i = 12; i != 16; ++i)
03919       if (!isUndefOrInRange(Mask[i], 12, 16))
03920         return false;
03921   }
03922 
03923   return true;
03924 }
03925 
03926 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
03927 /// is suitable for input to PSHUFLW.
03928 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03929   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03930     return false;
03931 
03932   // Upper quadword copied in order.
03933   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
03934     return false;
03935 
03936   // Lower quadword shuffled.
03937   for (unsigned i = 0; i != 4; ++i)
03938     if (!isUndefOrInRange(Mask[i], 0, 4))
03939       return false;
03940 
03941   if (VT == MVT::v16i16) {
03942     // Upper quadword copied in order.
03943     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
03944       return false;
03945 
03946     // Lower quadword shuffled.
03947     for (unsigned i = 8; i != 12; ++i)
03948       if (!isUndefOrInRange(Mask[i], 8, 12))
03949         return false;
03950   }
03951 
03952   return true;
03953 }
03954 
03955 /// \brief Return true if the mask specifies a shuffle of elements that is
03956 /// suitable for input to intralane (palignr) or interlane (valign) vector
03957 /// right-shift.
03958 static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
03959   unsigned NumElts = VT.getVectorNumElements();
03960   unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
03961   unsigned NumLaneElts = NumElts/NumLanes;
03962 
03963   // Do not handle 64-bit element shuffles with palignr.
03964   if (NumLaneElts == 2)
03965     return false;
03966 
03967   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
03968     unsigned i;
03969     for (i = 0; i != NumLaneElts; ++i) {
03970       if (Mask[i+l] >= 0)
03971         break;
03972     }
03973 
03974     // Lane is all undef, go to next lane
03975     if (i == NumLaneElts)
03976       continue;
03977 
03978     int Start = Mask[i+l];
03979 
03980     // Make sure it's in this lane in one of the sources
03981     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
03982         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
03983       return false;
03984 
03985     // If not lane 0, then we must match lane 0
03986     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
03987       return false;
03988 
03989     // Correct second source to be contiguous with first source
03990     if (Start >= (int)NumElts)
03991       Start -= NumElts - NumLaneElts;
03992 
03993     // Make sure we're shifting in the right direction.
03994     if (Start <= (int)(i+l))
03995       return false;
03996 
03997     Start -= i;
03998 
03999     // Check the rest of the elements to see if they are consecutive.
04000     for (++i; i != NumLaneElts; ++i) {
04001       int Idx = Mask[i+l];
04002 
04003       // Make sure it's in this lane
04004       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
04005           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
04006         return false;
04007 
04008       // If not lane 0, then we must match lane 0
04009       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
04010         return false;
04011 
04012       if (Idx >= (int)NumElts)
04013         Idx -= NumElts - NumLaneElts;
04014 
04015       if (!isUndefOrEqual(Idx, Start+i))
04016         return false;
04017 
04018     }
04019   }
04020 
04021   return true;
04022 }
04023 
04024 /// \brief Return true if the node specifies a shuffle of elements that is
04025 /// suitable for input to PALIGNR.
04026 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
04027                           const X86Subtarget *Subtarget) {
04028   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
04029       (VT.is256BitVector() && !Subtarget->hasInt256()) ||
04030       VT.is512BitVector())
04031     // FIXME: Add AVX512BW.
04032     return false;
04033 
04034   return isAlignrMask(Mask, VT, false);
04035 }
04036 
04037 /// \brief Return true if the node specifies a shuffle of elements that is
04038 /// suitable for input to VALIGN.
04039 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
04040                           const X86Subtarget *Subtarget) {
04041   // FIXME: Add AVX512VL.
04042   if (!VT.is512BitVector() || !Subtarget->hasAVX512())
04043     return false;
04044   return isAlignrMask(Mask, VT, true);
04045 }
04046 
04047 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
04048 /// the two vector operands have swapped position.
04049 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
04050                                      unsigned NumElems) {
04051   for (unsigned i = 0; i != NumElems; ++i) {
04052     int idx = Mask[i];
04053     if (idx < 0)
04054       continue;
04055     else if (idx < (int)NumElems)
04056       Mask[i] = idx + NumElems;
04057     else
04058       Mask[i] = idx - NumElems;
04059   }
04060 }
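// Illustration only: with NumElems == 4, commuting the mask <0, 5, 2, 7>
// yields <4, 1, 6, 3>: indices that referred to the first operand now refer
// to the second and vice versa, while undef (-1) entries are left untouched.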
04061 
04062 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
04063 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
04064 /// SHUFPS and SHUFPD. If Commuted is true, it checks whether the sources are in
04065 /// the reverse of the order x86 shuffles want.
04066 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
04067 
04068   unsigned NumElems = VT.getVectorNumElements();
04069   unsigned NumLanes = VT.getSizeInBits()/128;
04070   unsigned NumLaneElems = NumElems/NumLanes;
04071 
04072   if (NumLaneElems != 2 && NumLaneElems != 4)
04073     return false;
04074 
04075   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04076   bool symetricMaskRequired =
04077     (VT.getSizeInBits() >= 256) && (EltSize == 32);
04078 
04079   // VSHUFPSY divides the resulting vector into 4 chunks.
04080   // The sources are also split into 4 chunks, and each destination
04081   // chunk must come from a different source chunk.
04082   //
04083   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
04084   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
04085   //
04086   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
04087   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
04088   //
04089   // VSHUFPDY divides the resulting vector into 4 chunks.
04090   // The sources are also split into 4 chunks, and each destination
04091   // chunk must come from a different source chunk.
04092   //
04093   //  SRC1 =>      X3       X2       X1       X0
04094   //  SRC2 =>      Y3       Y2       Y1       Y0
04095   //
04096   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
04097   //
04098   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
04099   unsigned HalfLaneElems = NumLaneElems/2;
04100   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
04101     for (unsigned i = 0; i != NumLaneElems; ++i) {
04102       int Idx = Mask[i+l];
04103       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
04104       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
04105         return false;
04106       // For VSHUFPSY, the mask of the second half must be the same as the
04107       // first but with the appropriate offsets. This works in the same way as
04108       // VPERMILPS works with masks.
04109       if (!symetricMaskRequired || Idx < 0)
04110         continue;
04111       if (MaskVal[i] < 0) {
04112         MaskVal[i] = Idx - l;
04113         continue;
04114       }
04115       if ((signed)(Idx - l) != MaskVal[i])
04116         return false;
04117     }
04118   }
04119 
04120   return true;
04121 }
04122 
04123 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
04124 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
04125 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
04126   if (!VT.is128BitVector())
04127     return false;
04128 
04129   unsigned NumElems = VT.getVectorNumElements();
04130 
04131   if (NumElems != 4)
04132     return false;
04133 
04134   // Expect Mask[0] == 6, Mask[1] == 7, Mask[2] == 2, Mask[3] == 3
04135   return isUndefOrEqual(Mask[0], 6) &&
04136          isUndefOrEqual(Mask[1], 7) &&
04137          isUndefOrEqual(Mask[2], 2) &&
04138          isUndefOrEqual(Mask[3], 3);
04139 }
04140 
04141 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
04142 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
04143 /// <2, 3, 2, 3>
04144 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
04145   if (!VT.is128BitVector())
04146     return false;
04147 
04148   unsigned NumElems = VT.getVectorNumElements();
04149 
04150   if (NumElems != 4)
04151     return false;
04152 
04153   return isUndefOrEqual(Mask[0], 2) &&
04154          isUndefOrEqual(Mask[1], 3) &&
04155          isUndefOrEqual(Mask[2], 2) &&
04156          isUndefOrEqual(Mask[3], 3);
04157 }
04158 
04159 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
04160 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
04161 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
04162   if (!VT.is128BitVector())
04163     return false;
04164 
04165   unsigned NumElems = VT.getVectorNumElements();
04166 
04167   if (NumElems != 2 && NumElems != 4)
04168     return false;
04169 
04170   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04171     if (!isUndefOrEqual(Mask[i], i + NumElems))
04172       return false;
04173 
04174   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
04175     if (!isUndefOrEqual(Mask[i], i))
04176       return false;
04177 
04178   return true;
04179 }
04180 
04181 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
04182 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
04183 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
04184   if (!VT.is128BitVector())
04185     return false;
04186 
04187   unsigned NumElems = VT.getVectorNumElements();
04188 
04189   if (NumElems != 2 && NumElems != 4)
04190     return false;
04191 
04192   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04193     if (!isUndefOrEqual(Mask[i], i))
04194       return false;
04195 
04196   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04197     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
04198       return false;
04199 
04200   return true;
04201 }
04202 
04203 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
04204 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
04205 /// i.e., all but one element come from the same vector.
04206 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
04207   // TODO: Deal with AVX's VINSERTPS
04208   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
04209     return false;
04210 
04211   unsigned CorrectPosV1 = 0;
04212   unsigned CorrectPosV2 = 0;
04213   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
04214     if (Mask[i] == -1) {
04215       ++CorrectPosV1;
04216       ++CorrectPosV2;
04217       continue;
04218     }
04219 
04220     if (Mask[i] == i)
04221       ++CorrectPosV1;
04222     else if (Mask[i] == i + 4)
04223       ++CorrectPosV2;
04224   }
04225 
04226   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
04227     // We have 3 elements (undefs count as elements from any vector) from one
04228     // vector, and one from another.
04229     return true;
04230 
04231   return false;
04232 }
04233 
04234 //
04235 // Some special combinations that can be optimized.
04236 //
04237 static
04238 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
04239                                SelectionDAG &DAG) {
04240   MVT VT = SVOp->getSimpleValueType(0);
04241   SDLoc dl(SVOp);
04242 
04243   if (VT != MVT::v8i32 && VT != MVT::v8f32)
04244     return SDValue();
04245 
04246   ArrayRef<int> Mask = SVOp->getMask();
04247 
04248   // These are the special masks that may be optimized.
04249   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
04250   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
04251   bool MatchEvenMask = true;
04252   bool MatchOddMask  = true;
04253   for (int i=0; i<8; ++i) {
04254     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
04255       MatchEvenMask = false;
04256     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
04257       MatchOddMask = false;
04258   }
04259 
04260   if (!MatchEvenMask && !MatchOddMask)
04261     return SDValue();
04262 
04263   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
04264 
04265   SDValue Op0 = SVOp->getOperand(0);
04266   SDValue Op1 = SVOp->getOperand(1);
04267 
04268   if (MatchEvenMask) {
04269     // Shift the second operand right by 32 bits.
04270     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
04271     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
04272   } else {
04273     // Shift the first operand left by 32 bits.
04274     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
04275     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
04276   }
04277   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
04278   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
04279 }
04280 
04281 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
04282 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
04283 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
04284                          bool HasInt256, bool V2IsSplat = false) {
04285 
04286   assert(VT.getSizeInBits() >= 128 &&
04287          "Unsupported vector type for unpckl");
04288 
04289   unsigned NumElts = VT.getVectorNumElements();
04290   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04291       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04292     return false;
04293 
04294   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
04295          "Unsupported vector type for unpckh");
04296 
04297   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04298   unsigned NumLanes = VT.getSizeInBits()/128;
04299   unsigned NumLaneElts = NumElts/NumLanes;
04300 
04301   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04302     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04303       int BitI  = Mask[l+i];
04304       int BitI1 = Mask[l+i+1];
04305       if (!isUndefOrEqual(BitI, j))
04306         return false;
04307       if (V2IsSplat) {
04308         if (!isUndefOrEqual(BitI1, NumElts))
04309           return false;
04310       } else {
04311         if (!isUndefOrEqual(BitI1, j + NumElts))
04312           return false;
04313       }
04314     }
04315   }
04316 
04317   return true;
04318 }
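// Illustration only: for v4i32 the classic unpcklps pattern is the mask
// <0, 4, 1, 5>, interleaving element j of the low half of each source, and
// isUNPCKLMask accepts it. With V2IsSplat set, the odd positions may all
// point at element NumElts (the splatted value) instead.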
04319 
04320 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
04321 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
04322 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
04323                          bool HasInt256, bool V2IsSplat = false) {
04324   assert(VT.getSizeInBits() >= 128 &&
04325          "Unsupported vector type for unpckh");
04326 
04327   unsigned NumElts = VT.getVectorNumElements();
04328   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04329       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04330     return false;
04331 
04332   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
04333          "Unsupported vector type for unpckh");
04334 
04335   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04336   unsigned NumLanes = VT.getSizeInBits()/128;
04337   unsigned NumLaneElts = NumElts/NumLanes;
04338 
04339   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04340     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04341       int BitI  = Mask[l+i];
04342       int BitI1 = Mask[l+i+1];
04343       if (!isUndefOrEqual(BitI, j))
04344         return false;
04345       if (V2IsSplat) {
04346         if (isUndefOrEqual(BitI1, NumElts))
04347           return false;
04348       } else {
04349         if (!isUndefOrEqual(BitI1, j+NumElts))
04350           return false;
04351       }
04352     }
04353   }
04354   return true;
04355 }
04356 
04357 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
04358 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
04359 /// <0, 0, 1, 1>
04360 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04361   unsigned NumElts = VT.getVectorNumElements();
04362   bool Is256BitVec = VT.is256BitVector();
04363 
04364   if (VT.is512BitVector())
04365     return false;
04366   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04367          "Unsupported vector type for unpckh");
04368 
04369   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
04370       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04371     return false;
04372 
04373   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
04374   // FIXME: Need a better way to get rid of this, there's no latency difference
04375   // between UNPCKLPD and MOVDDUP, the latter should always be checked first and
04376   // the former later. We should also remove the "_undef" special mask.
04377   if (NumElts == 4 && Is256BitVec)
04378     return false;
04379 
04380   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04381   // independently on 128-bit lanes.
04382   unsigned NumLanes = VT.getSizeInBits()/128;
04383   unsigned NumLaneElts = NumElts/NumLanes;
04384 
04385   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04386     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04387       int BitI  = Mask[l+i];
04388       int BitI1 = Mask[l+i+1];
04389 
04390       if (!isUndefOrEqual(BitI, j))
04391         return false;
04392       if (!isUndefOrEqual(BitI1, j))
04393         return false;
04394     }
04395   }
04396 
04397   return true;
04398 }
04399 
04400 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
04401 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
04402 /// <2, 2, 3, 3>
04403 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04404   unsigned NumElts = VT.getVectorNumElements();
04405 
04406   if (VT.is512BitVector())
04407     return false;
04408 
04409   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04410          "Unsupported vector type for unpckh");
04411 
04412   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04413       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04414     return false;
04415 
04416   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04417   // independently on 128-bit lanes.
04418   unsigned NumLanes = VT.getSizeInBits()/128;
04419   unsigned NumLaneElts = NumElts/NumLanes;
04420 
04421   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04422     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04423       int BitI  = Mask[l+i];
04424       int BitI1 = Mask[l+i+1];
04425       if (!isUndefOrEqual(BitI, j))
04426         return false;
04427       if (!isUndefOrEqual(BitI1, j))
04428         return false;
04429     }
04430   }
04431   return true;
04432 }
04433 
04434 // Match for INSERTI64x4/INSERTF64x4 instructions (src0[0], src1[0]) or
04435 // (src1[0], src0[1]); these manipulate 256-bit sub-vectors of a 512-bit vector.
04436 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
04437   if (!VT.is512BitVector())
04438     return false;
04439 
04440   unsigned NumElts = VT.getVectorNumElements();
04441   unsigned HalfSize = NumElts/2;
04442   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
04443     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
04444       *Imm = 1;
04445       return true;
04446     }
04447   }
04448   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
04449     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
04450       *Imm = 0;
04451       return true;
04452     }
04453   }
04454   return false;
04455 }
04456 
04457 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
04458 /// specifies a shuffle of elements that is suitable for input to MOVSS,
04459 /// MOVSD, and MOVD, i.e. setting the lowest element.
04460 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
04461   if (VT.getVectorElementType().getSizeInBits() < 32)
04462     return false;
04463   if (!VT.is128BitVector())
04464     return false;
04465 
04466   unsigned NumElts = VT.getVectorNumElements();
04467 
04468   if (!isUndefOrEqual(Mask[0], NumElts))
04469     return false;
04470 
04471   for (unsigned i = 1; i != NumElts; ++i)
04472     if (!isUndefOrEqual(Mask[i], i))
04473       return false;
04474 
04475   return true;
04476 }
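// Illustration only: for v4f32 the MOVSS pattern accepted by isMOVLMask is
// <4, 1, 2, 3>: the low element comes from V2 (mask index NumElts) and the
// remaining elements come from V1 in order.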
04477 
04478 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
04479 /// as permutations between 128-bit chunks or halves. As an example: this
04480 /// shuffle below:
04481 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
04482 /// The first half comes from the second half of V1 and the second half from
04483 /// the second half of V2.
04484 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04485   if (!HasFp256 || !VT.is256BitVector())
04486     return false;
04487 
04488   // The shuffle result is divided into half A and half B. In total the two
04489   // sources have 4 halves, namely: C, D, E, F. The final values of A and
04490   // B must come from C, D, E or F.
04491   unsigned HalfSize = VT.getVectorNumElements()/2;
04492   bool MatchA = false, MatchB = false;
04493 
04494   // Check if A comes from one of C, D, E, F.
04495   for (unsigned Half = 0; Half != 4; ++Half) {
04496     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
04497       MatchA = true;
04498       break;
04499     }
04500   }
04501 
04502   // Check if B comes from one of C, D, E, F.
04503   for (unsigned Half = 0; Half != 4; ++Half) {
04504     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
04505       MatchB = true;
04506       break;
04507     }
04508   }
04509 
04510   return MatchA && MatchB;
04511 }
04512 
04513 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
04514 /// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
04515 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
04516   MVT VT = SVOp->getSimpleValueType(0);
04517 
04518   unsigned HalfSize = VT.getVectorNumElements()/2;
04519 
04520   unsigned FstHalf = 0, SndHalf = 0;
04521   for (unsigned i = 0; i < HalfSize; ++i) {
04522     if (SVOp->getMaskElt(i) > 0) {
04523       FstHalf = SVOp->getMaskElt(i)/HalfSize;
04524       break;
04525     }
04526   }
04527   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
04528     if (SVOp->getMaskElt(i) > 0) {
04529       SndHalf = SVOp->getMaskElt(i)/HalfSize;
04530       break;
04531     }
04532   }
04533 
04534   return (FstHalf | (SndHalf << 4));
04535 }
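// Illustration only: for the v8i32 mask <4, 5, 6, 7, 12, 13, 14, 15> shown in
// the isVPERM2X128Mask comment above, HalfSize is 4, so FstHalf = 4/4 = 1 and
// SndHalf = 12/4 = 3, giving the immediate 1 | (3 << 4) = 0x31.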
04536 
04537 // Symmetric in-lane mask. Each lane has 4 elements (for imm8)
04538 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
04539   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04540   if (EltSize < 32)
04541     return false;
04542 
04543   unsigned NumElts = VT.getVectorNumElements();
04544   Imm8 = 0;
04545   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
04546     for (unsigned i = 0; i != NumElts; ++i) {
04547       if (Mask[i] < 0)
04548         continue;
04549       Imm8 |= Mask[i] << (i*2);
04550     }
04551     return true;
04552   }
04553 
04554   unsigned LaneSize = 4;
04555   SmallVector<int, 4> MaskVal(LaneSize, -1);
04556 
04557   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04558     for (unsigned i = 0; i != LaneSize; ++i) {
04559       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04560         return false;
04561       if (Mask[i+l] < 0)
04562         continue;
04563       if (MaskVal[i] < 0) {
04564         MaskVal[i] = Mask[i+l] - l;
04565         Imm8 |= MaskVal[i] << (i*2);
04566         continue;
04567       }
04568       if (Mask[i+l] != (signed)(MaskVal[i]+l))
04569         return false;
04570     }
04571   }
04572   return true;
04573 }
04574 
04575 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
04576 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
04577 /// Note that VPERMIL mask matching differs depending on whether the underlying
04578 /// type is 32 or 64 bits wide. For VPERMILPS the high half of the mask should
04579 /// point to the same elements as the low half, but within the high half of the
04580 /// source. For VPERMILPD the two lanes may be shuffled independently of each
04581 /// other, with the same no-lane-crossing restriction. Also handles PSHUFDY.
04582 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
04583   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04584   if (VT.getSizeInBits() < 256 || EltSize < 32)
04585     return false;
04586   bool symetricMaskRequired = (EltSize == 32);
04587   unsigned NumElts = VT.getVectorNumElements();
04588 
04589   unsigned NumLanes = VT.getSizeInBits()/128;
04590   unsigned LaneSize = NumElts/NumLanes;
04591   // 2 or 4 elements in one lane
04592 
04593   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
04594   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04595     for (unsigned i = 0; i != LaneSize; ++i) {
04596       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04597         return false;
04598       if (symetricMaskRequired) {
04599         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
04600           ExpectedMaskVal[i] = Mask[i+l] - l;
04601           continue;
04602         }
04603         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
04604           return false;
04605       }
04606     }
04607   }
04608   return true;
04609 }
04610 
04611 /// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
04612 /// x86 movss wants. X86 movss requires the lowest element to be the lowest
04613 /// element of vector 2 and the other elements to come from vector 1 in order.
04614 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
04615                                bool V2IsSplat = false, bool V2IsUndef = false) {
04616   if (!VT.is128BitVector())
04617     return false;
04618 
04619   unsigned NumOps = VT.getVectorNumElements();
04620   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
04621     return false;
04622 
04623   if (!isUndefOrEqual(Mask[0], 0))
04624     return false;
04625 
04626   for (unsigned i = 1; i != NumOps; ++i)
04627     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
04628           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
04629           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
04630       return false;
04631 
04632   return true;
04633 }
04634 
04635 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04636 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
04637 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
04638 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
04639                            const X86Subtarget *Subtarget) {
04640   if (!Subtarget->hasSSE3())
04641     return false;
04642 
04643   unsigned NumElems = VT.getVectorNumElements();
04644 
04645   if ((VT.is128BitVector() && NumElems != 4) ||
04646       (VT.is256BitVector() && NumElems != 8) ||
04647       (VT.is512BitVector() && NumElems != 16))
04648     return false;
04649 
04650   // "i+1" is the value the indexed mask element must have
04651   for (unsigned i = 0; i != NumElems; i += 2)
04652     if (!isUndefOrEqual(Mask[i], i+1) ||
04653         !isUndefOrEqual(Mask[i+1], i+1))
04654       return false;
04655 
04656   return true;
04657 }
04658 
04659 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04660 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
04661 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
04662 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
04663                            const X86Subtarget *Subtarget) {
04664   if (!Subtarget->hasSSE3())
04665     return false;
04666 
04667   unsigned NumElems = VT.getVectorNumElements();
04668 
04669   if ((VT.is128BitVector() && NumElems != 4) ||
04670       (VT.is256BitVector() && NumElems != 8) ||
04671       (VT.is512BitVector() && NumElems != 16))
04672     return false;
04673 
04674   // "i" is the value the indexed mask element must have
04675   for (unsigned i = 0; i != NumElems; i += 2)
04676     if (!isUndefOrEqual(Mask[i], i) ||
04677         !isUndefOrEqual(Mask[i+1], i))
04678       return false;
04679 
04680   return true;
04681 }
04682 
04683 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
04684 /// specifies a shuffle of elements that is suitable for input to 256-bit
04685 /// version of MOVDDUP.
04686 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04687   if (!HasFp256 || !VT.is256BitVector())
04688     return false;
04689 
04690   unsigned NumElts = VT.getVectorNumElements();
04691   if (NumElts != 4)
04692     return false;
04693 
04694   for (unsigned i = 0; i != NumElts/2; ++i)
04695     if (!isUndefOrEqual(Mask[i], 0))
04696       return false;
04697   for (unsigned i = NumElts/2; i != NumElts; ++i)
04698     if (!isUndefOrEqual(Mask[i], NumElts/2))
04699       return false;
04700   return true;
04701 }
04702 
04703 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04704 /// specifies a shuffle of elements that is suitable for input to 128-bit
04705 /// version of MOVDDUP.
04706 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
04707   if (!VT.is128BitVector())
04708     return false;
04709 
04710   unsigned e = VT.getVectorNumElements() / 2;
04711   for (unsigned i = 0; i != e; ++i)
04712     if (!isUndefOrEqual(Mask[i], i))
04713       return false;
04714   for (unsigned i = 0; i != e; ++i)
04715     if (!isUndefOrEqual(Mask[e+i], i))
04716       return false;
04717   return true;
04718 }
04719 
04720 /// isVEXTRACTIndex - Return true if the specified
04721 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
04722 /// suitable for instructions that extract 128- or 256-bit vectors
04723 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
04724   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04725   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04726     return false;
04727 
04728   // The index should be aligned on a vecWidth-bit boundary.
04729   uint64_t Index =
04730     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04731 
04732   MVT VT = N->getSimpleValueType(0);
04733   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04734   bool Result = (Index * ElSize) % vecWidth == 0;
04735 
04736   return Result;
04737 }
04738 
04739 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
04740 /// operand specifies a subvector insert that is suitable for input to
04741 /// insertion of 128- or 256-bit subvectors
04742 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
04743   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04744   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04745     return false;
04746   // The index should be aligned on a vecWidth-bit boundary.
04747   uint64_t Index =
04748     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04749 
04750   MVT VT = N->getSimpleValueType(0);
04751   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04752   bool Result = (Index * ElSize) % vecWidth == 0;
04753 
04754   return Result;
04755 }
04756 
04757 bool X86::isVINSERT128Index(SDNode *N) {
04758   return isVINSERTIndex(N, 128);
04759 }
04760 
04761 bool X86::isVINSERT256Index(SDNode *N) {
04762   return isVINSERTIndex(N, 256);
04763 }
04764 
04765 bool X86::isVEXTRACT128Index(SDNode *N) {
04766   return isVEXTRACTIndex(N, 128);
04767 }
04768 
04769 bool X86::isVEXTRACT256Index(SDNode *N) {
04770   return isVEXTRACTIndex(N, 256);
04771 }
04772 
04773 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
04774 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
04775 /// Handles 128-bit and 256-bit.
04776 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
04777   MVT VT = N->getSimpleValueType(0);
04778 
04779   assert((VT.getSizeInBits() >= 128) &&
04780          "Unsupported vector type for PSHUF/SHUFP");
04781 
04782   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
04783   // independently on 128-bit lanes.
04784   unsigned NumElts = VT.getVectorNumElements();
04785   unsigned NumLanes = VT.getSizeInBits()/128;
04786   unsigned NumLaneElts = NumElts/NumLanes;
04787 
04788   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
04789          "Only supports 2, 4 or 8 elements per lane");
04790 
04791   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
04792   unsigned Mask = 0;
04793   for (unsigned i = 0; i != NumElts; ++i) {
04794     int Elt = N->getMaskElt(i);
04795     if (Elt < 0) continue;
04796     Elt &= NumLaneElts - 1;
04797     unsigned ShAmt = (i << Shift) % 8;
04798     Mask |= Elt << ShAmt;
04799   }
04800 
04801   return Mask;
04802 }
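// Illustration only: a worked example of the encoding above. For a v4f32
// shuffle mask <3, 1, 2, 0>, Shift is 1 (four elements per lane), so the
// immediate is 3 << 0 | 1 << 2 | 2 << 4 | 0 << 6 = 0x27, which is the
// shufps/pshufd control byte for that permutation. A minimal standalone
// sketch of the same 2-bits-per-element packing (helper name is hypothetical,
// shown only for illustration):
//   static unsigned encodeShuf4(const int M[4]) {
//     unsigned Imm = 0;
//     for (unsigned i = 0; i != 4; ++i)
//       if (M[i] >= 0)
//         Imm |= (M[i] & 3) << (i * 2); // two bits per element
//     return Imm;                       // e.g. {3, 1, 2, 0} -> 0x27
//   }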
04803 
04804 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
04805 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
04806 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
04807   MVT VT = N->getSimpleValueType(0);
04808 
04809   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04810          "Unsupported vector type for PSHUFHW");
04811 
04812   unsigned NumElts = VT.getVectorNumElements();
04813 
04814   unsigned Mask = 0;
04815   for (unsigned l = 0; l != NumElts; l += 8) {
04816     // 8 nodes per lane, but we only care about the last 4.
04817     for (unsigned i = 0; i < 4; ++i) {
04818       int Elt = N->getMaskElt(l+i+4);
04819       if (Elt < 0) continue;
04820       Elt &= 0x3; // only 2-bits.
04821       Mask |= Elt << (i * 2);
04822     }
04823   }
04824 
04825   return Mask;
04826 }
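// Illustration only: for a v8i16 mask <0, 1, 2, 3, 7, 6, 5, 4> only the high
// four elements matter; after masking with 0x3 they contribute
// 3 | 2 << 2 | 1 << 4 | 0 << 6 = 0x1B, the pshufhw control byte that reverses
// the upper quadword while leaving the lower quadword in place.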
04827 
04828 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
04829 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
04830 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
04831   MVT VT = N->getSimpleValueType(0);
04832 
04833   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04834          "Unsupported vector type for PSHUFHW");
04835 
04836   unsigned NumElts = VT.getVectorNumElements();
04837 
04838   unsigned Mask = 0;
04839   for (unsigned l = 0; l != NumElts; l += 8) {
04840     // 8 nodes per lane, but we only care about the first 4.
04841     for (unsigned i = 0; i < 4; ++i) {
04842       int Elt = N->getMaskElt(l+i);
04843       if (Elt < 0) continue;
04844       Elt &= 0x3; // only 2-bits
04845       Mask |= Elt << (i * 2);
04846     }
04847   }
04848 
04849   return Mask;
04850 }
04851 
04852 /// \brief Return the appropriate immediate to shuffle the specified
04853 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
04854 /// VALIGN (if InterLane is true) instructions.
04855 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
04856                                            bool InterLane) {
04857   MVT VT = SVOp->getSimpleValueType(0);
04858   unsigned EltSize = InterLane ? 1 :
04859     VT.getVectorElementType().getSizeInBits() >> 3;
04860 
04861   unsigned NumElts = VT.getVectorNumElements();
04862   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
04863   unsigned NumLaneElts = NumElts/NumLanes;
04864 
04865   int Val = 0;
04866   unsigned i;
04867   for (i = 0; i != NumElts; ++i) {
04868     Val = SVOp->getMaskElt(i);
04869     if (Val >= 0)
04870       break;
04871   }
04872   if (Val >= (int)NumElts)
04873     Val -= NumElts - NumLaneElts;
04874 
04875   assert(Val - i > 0 && "PALIGNR imm should be positive");
04876   return (Val - i) * EltSize;
04877 }
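// Illustration only: for a v8i16 palignr mask <3, 4, 5, 6, 7, 8, 9, 10> the
// first defined element is Val = 3 at i = 0 and EltSize is 2 bytes, so the
// returned immediate is (3 - 0) * 2 = 6, i.e. a 6-byte right shift of the
// concatenated sources.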
04878 
04879 /// \brief Return the appropriate immediate to shuffle the specified
04880 /// VECTOR_SHUFFLE mask with the PALIGNR instruction.
04881 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
04882   return getShuffleAlignrImmediate(SVOp, false);
04883 }
04884 
04885 /// \brief Return the appropriate immediate to shuffle the specified
04886 /// VECTOR_SHUFFLE mask with the VALIGN instruction.
04887 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
04888   return getShuffleAlignrImmediate(SVOp, true);
04889 }
04890 
04891 
04892 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
04893   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04894   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04895     llvm_unreachable("Illegal extract subvector for VEXTRACT");
04896 
04897   uint64_t Index =
04898     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04899 
04900   MVT VecVT = N->getOperand(0).getSimpleValueType();
04901   MVT ElVT = VecVT.getVectorElementType();
04902 
04903   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04904   return Index / NumElemsPerChunk;
04905 }
04906 
04907 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
04908   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04909   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04910     llvm_unreachable("Illegal insert subvector for VINSERT");
04911 
04912   uint64_t Index =
04913     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04914 
04915   MVT VecVT = N->getSimpleValueType(0);
04916   MVT ElVT = VecVT.getVectorElementType();
04917 
04918   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04919   return Index / NumElemsPerChunk;
04920 }
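// Editor's note (worked example, not in the original source): both helpers
// simply convert an element index into a chunk index. Extracting the
// subvector that starts at element 4 of a v8f32 with vecWidth=128 gives
// NumElemsPerChunk = 128/32 = 4 and an immediate of 4/4 = 1 (the upper
// 128-bit half); element index 0 would give immediate 0 (the lower half).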
04921 
04922 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
04923 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
04924 /// and VINSERTI128 instructions.
04925 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
04926   return getExtractVEXTRACTImmediate(N, 128);
04927 }
04928 
04929 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
04930 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
04931 /// and VINSERTI64x4 instructions.
04932 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
04933   return getExtractVEXTRACTImmediate(N, 256);
04934 }
04935 
04936 /// getInsertVINSERT128Immediate - Return the appropriate immediate
04937 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
04938 /// and VINSERTI128 instructions.
04939 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
04940   return getInsertVINSERTImmediate(N, 128);
04941 }
04942 
04943 /// getInsertVINSERT256Immediate - Return the appropriate immediate
04944 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
04945 /// and VINSERTI64x4 instructions.
04946 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
04947   return getInsertVINSERTImmediate(N, 256);
04948 }
04949 
04950 /// isZero - Returns true if V is a constant integer zero
04951 static bool isZero(SDValue V) {
04952   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
04953   return C && C->isNullValue();
04954 }
04955 
04956 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
04957 /// constant +0.0.
04958 bool X86::isZeroNode(SDValue Elt) {
04959   if (isZero(Elt))
04960     return true;
04961   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
04962     return CFP->getValueAPF().isPosZero();
04963   return false;
04964 }
04965 
04966 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
04967 /// match movhlps. The lower half elements should come from upper half of
04968 /// V1 (and in order), and the upper half elements should come from the upper
04969 /// half of V2 (and in order).
04970 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
04971   if (!VT.is128BitVector())
04972     return false;
04973   if (VT.getVectorNumElements() != 4)
04974     return false;
04975   for (unsigned i = 0, e = 2; i != e; ++i)
04976     if (!isUndefOrEqual(Mask[i], i+2))
04977       return false;
04978   for (unsigned i = 2; i != 4; ++i)
04979     if (!isUndefOrEqual(Mask[i], i+4))
04980       return false;
04981   return true;
04982 }
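// Editor's note (worked example, not in the original source): for v4f32 the
// only masks accepted above are of the form <2,3,6,7> (undefs allowed), i.e.
// the result's low half is V1's high half and its high half is V2's high
// half, which matches a MOVHLPS with the operands ordered appropriately.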
04983 
04984 /// isScalarLoadToVector - Returns true if the node is a scalar load that
04985 /// is promoted to a vector. It also returns the LoadSDNode by reference if
04986 /// required.
04987 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
04988   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
04989     return false;
04990   N = N->getOperand(0).getNode();
04991   if (!ISD::isNON_EXTLoad(N))
04992     return false;
04993   if (LD)
04994     *LD = cast<LoadSDNode>(N);
04995   return true;
04996 }
04997 
04998 // Test whether the given value is a vector value which will be legalized
04999 // into a load.
05000 static bool WillBeConstantPoolLoad(SDNode *N) {
05001   if (N->getOpcode() != ISD::BUILD_VECTOR)
05002     return false;
05003 
05004   // Check for any non-constant elements.
05005   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
05006     switch (N->getOperand(i).getNode()->getOpcode()) {
05007     case ISD::UNDEF:
05008     case ISD::ConstantFP:
05009     case ISD::Constant:
05010       break;
05011     default:
05012       return false;
05013     }
05014 
05015   // Vectors of all-zeros and all-ones are materialized with special
05016   // instructions rather than being loaded.
05017   return !ISD::isBuildVectorAllZeros(N) &&
05018          !ISD::isBuildVectorAllOnes(N);
05019 }
05020 
05021 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
05022 /// match movlp{s|d}. The lower half elements should come from lower half of
05023 /// V1 (and in order), and the upper half elements should come from the upper
05024 /// half of V2 (and in order). And since V1 will become the source of the
05025 /// MOVLP, it must be either a vector load or a scalar load to vector.
05026 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
05027                                ArrayRef<int> Mask, MVT VT) {
05028   if (!VT.is128BitVector())
05029     return false;
05030 
05031   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
05032     return false;
05033   // If V2 is a vector load, don't do this transformation. We will try to use
05034   // a load-folding shufps instead.
05035   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
05036     return false;
05037 
05038   unsigned NumElems = VT.getVectorNumElements();
05039 
05040   if (NumElems != 2 && NumElems != 4)
05041     return false;
05042   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
05043     if (!isUndefOrEqual(Mask[i], i))
05044       return false;
05045   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
05046     if (!isUndefOrEqual(Mask[i], i+NumElems))
05047       return false;
05048   return true;
05049 }
05050 
05051 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
05052 /// to a zero vector.
05053 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
05054 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
05055   SDValue V1 = N->getOperand(0);
05056   SDValue V2 = N->getOperand(1);
05057   unsigned NumElems = N->getValueType(0).getVectorNumElements();
05058   for (unsigned i = 0; i != NumElems; ++i) {
05059     int Idx = N->getMaskElt(i);
05060     if (Idx >= (int)NumElems) {
05061       unsigned Opc = V2.getOpcode();
05062       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
05063         continue;
05064       if (Opc != ISD::BUILD_VECTOR ||
05065           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
05066         return false;
05067     } else if (Idx >= 0) {
05068       unsigned Opc = V1.getOpcode();
05069       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
05070         continue;
05071       if (Opc != ISD::BUILD_VECTOR ||
05072           !X86::isZeroNode(V1.getOperand(Idx)))
05073         return false;
05074     }
05075   }
05076   return true;
05077 }
05078 
05079 /// getZeroVector - Returns a vector of specified type with all zero elements.
05080 ///
05081 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
05082                              SelectionDAG &DAG, SDLoc dl) {
05083   assert(VT.isVector() && "Expected a vector type");
05084 
05085   // Always build SSE zero vectors as <4 x i32> bitcasted
05086   // to their dest type. This ensures they get CSE'd.
05087   SDValue Vec;
05088   if (VT.is128BitVector()) {  // SSE
05089     if (Subtarget->hasSSE2()) {  // SSE2
05090       SDValue Cst = DAG.getConstant(0, MVT::i32);
05091       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05092     } else { // SSE1
05093       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
05094       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
05095     }
05096   } else if (VT.is256BitVector()) { // AVX
05097     if (Subtarget->hasInt256()) { // AVX2
05098       SDValue Cst = DAG.getConstant(0, MVT::i32);
05099       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05100       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
05101     } else {
05102       // 256-bit logic and arithmetic instructions in AVX are all
05103       // floating-point, no support for integer ops. Emit fp zeroed vectors.
05104       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
05105       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05106       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
05107     }
05108   } else if (VT.is512BitVector()) { // AVX-512
05109       SDValue Cst = DAG.getConstant(0, MVT::i32);
05110       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
05111                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05112       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
05113   } else if (VT.getScalarType() == MVT::i1) {
05114     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
05115     SDValue Cst = DAG.getConstant(0, MVT::i1);
05116     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
05117     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
05118   } else
05119     llvm_unreachable("Unexpected vector type");
05120 
05121   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
05122 }
05123 
05124 /// getOnesVector - Returns a vector of specified type with all bits set.
05125 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
05126 /// no AVX2 support, use two <4 x i32> inserted in an <8 x i32> appropriately.
05127 /// Then bitcast to their original type, ensuring they get CSE'd.
05128 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
05129                              SDLoc dl) {
05130   assert(VT.isVector() && "Expected a vector type");
05131 
05132   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
05133   SDValue Vec;
05134   if (VT.is256BitVector()) {
05135     if (HasInt256) { // AVX2
05136       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05137       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
05138     } else { // AVX
05139       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05140       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
05141     }
05142   } else if (VT.is128BitVector()) {
05143     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05144   } else
05145     llvm_unreachable("Unexpected vector type");
05146 
05147   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
05148 }
05149 
05150 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
05151 /// that point to V2 point to its first element.
05152 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
05153   for (unsigned i = 0; i != NumElems; ++i) {
05154     if (Mask[i] > (int)NumElems) {
05155       Mask[i] = NumElems;
05156     }
05157   }
05158 }
05159 
05160 /// getMOVL - Returns a vector_shuffle node for a movs{s|d} or movd
05161 /// operation of the specified width.
05162 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
05163                        SDValue V2) {
05164   unsigned NumElems = VT.getVectorNumElements();
05165   SmallVector<int, 8> Mask;
05166   Mask.push_back(NumElems);
05167   for (unsigned i = 1; i != NumElems; ++i)
05168     Mask.push_back(i);
05169   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05170 }
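// Editor's note (worked example, not in the original source): for a
// four-element type this builds the mask <4,1,2,3>, i.e. element 0 of the
// result is taken from V2 and the remaining elements from V1, the
// movss/movsd-style "move scalar into the low element" pattern.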
05171 
05172 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
05173 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
05174                           SDValue V2) {
05175   unsigned NumElems = VT.getVectorNumElements();
05176   SmallVector<int, 8> Mask;
05177   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
05178     Mask.push_back(i);
05179     Mask.push_back(i + NumElems);
05180   }
05181   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05182 }
05183 
05184 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
05185 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
05186                           SDValue V2) {
05187   unsigned NumElems = VT.getVectorNumElements();
05188   SmallVector<int, 8> Mask;
05189   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
05190     Mask.push_back(i + Half);
05191     Mask.push_back(i + NumElems + Half);
05192   }
05193   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05194 }
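// Editor's note (worked example, not in the original source): for v4i32,
// getUnpackl builds the mask <0,4,1,5> (interleave the low halves of V1 and
// V2) and getUnpackh builds <2,6,3,7> (interleave the high halves), matching
// the punpckl*/punpckh* instruction family.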
05195 
05196 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
05197 // a generic shuffle instruction because the target has no such instructions.
05198 // Generate shuffles which repeat i16 and i8 several times until they can be
05199 // represented by v4f32 and then be manipulated by target-supported shuffles.
05200 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
05201   MVT VT = V.getSimpleValueType();
05202   int NumElems = VT.getVectorNumElements();
05203   SDLoc dl(V);
05204 
05205   while (NumElems > 4) {
05206     if (EltNo < NumElems/2) {
05207       V = getUnpackl(DAG, dl, VT, V, V);
05208     } else {
05209       V = getUnpackh(DAG, dl, VT, V, V);
05210       EltNo -= NumElems/2;
05211     }
05212     NumElems >>= 1;
05213   }
05214   return V;
05215 }
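// Editor's note (worked example, not in the original source): splatting
// element 11 of a v16i8 runs the loop twice. The first iteration (EltNo=11,
// NumElems=16) takes the unpackh path, duplicating each high byte and leaving
// the wanted byte at conceptual position EltNo=3 of 8 doubled elements; the
// second iteration (EltNo=3, NumElems=8) takes the unpackl path, after which
// the byte fills the whole top 32-bit slot, so getLegalSplat can splat dword 3
// with a v4f32 shuffle.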
05216 
05217 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
05218 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
05219   MVT VT = V.getSimpleValueType();
05220   SDLoc dl(V);
05221 
05222   if (VT.is128BitVector()) {
05223     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
05224     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
05225     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
05226                              &SplatMask[0]);
05227   } else if (VT.is256BitVector()) {
05228     // To use VPERMILPS to splat scalars, the second half of indices must
05229     // refer to the higher part, which is a duplication of the lower one,
05230     // because VPERMILPS can only handle in-lane permutations.
05231     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
05232                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
05233 
05234     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
05235     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
05236                              &SplatMask[0]);
05237   } else
05238     llvm_unreachable("Vector size not supported");
05239 
05240   return DAG.getNode(ISD::BITCAST, dl, VT, V);
05241 }
05242 
05243 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
05244 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
05245   MVT SrcVT = SV->getSimpleValueType(0);
05246   SDValue V1 = SV->getOperand(0);
05247   SDLoc dl(SV);
05248 
05249   int EltNo = SV->getSplatIndex();
05250   int NumElems = SrcVT.getVectorNumElements();
05251   bool Is256BitVec = SrcVT.is256BitVector();
05252 
05253   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
05254          "Unknown how to promote splat for type");
05255 
05256   // Extract the 128-bit part containing the splat element and update
05257   // the splat element index when it refers to the higher register.
05258   if (Is256BitVec) {
05259     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
05260     if (EltNo >= NumElems/2)
05261       EltNo -= NumElems/2;
05262   }
05263 
05264   // All i16 and i8 vector types can't be used directly by a generic shuffle
05265   // instruction because the target has no such instruction. Generate shuffles
05266   // which repeat i16 and i8 several times until they fit in i32, and then can
05267   // be manipulated by target-supported shuffles.
05268   MVT EltVT = SrcVT.getVectorElementType();
05269   if (EltVT == MVT::i8 || EltVT == MVT::i16)
05270     V1 = PromoteSplati8i16(V1, DAG, EltNo);
05271 
05272   // Recreate the 256-bit vector and place the same 128-bit vector
05273   // into the low and high part. This is necessary because we want
05274   // to use VPERM* to shuffle the vectors
05275   if (Is256BitVec) {
05276     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
05277   }
05278 
05279   return getLegalSplat(DAG, V1, EltNo);
05280 }
05281 
05282 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
05283 /// vector of zero or undef vector.  This produces a shuffle where the low
05284 /// element of V2 is swizzled into the zero/undef vector, landing at element
05285 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
05286 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
05287                                            bool IsZero,
05288                                            const X86Subtarget *Subtarget,
05289                                            SelectionDAG &DAG) {
05290   MVT VT = V2.getSimpleValueType();
05291   SDValue V1 = IsZero
05292     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
05293   unsigned NumElems = VT.getVectorNumElements();
05294   SmallVector<int, 16> MaskVec;
05295   for (unsigned i = 0; i != NumElems; ++i)
05296     // If this is the insertion idx, put the low elt of V2 here.
05297     MaskVec.push_back(i == Idx ? NumElems : i);
05298   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
05299 }
05300 
05301 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
05302 /// target specific opcode. Returns true if the Mask could be calculated. Sets
05303 /// IsUnary to true if it only uses one source. Note that this will set IsUnary for
05304 /// shuffles which use a single input multiple times, and in those cases it will
05305 /// adjust the mask to only have indices within that single input.
05306 static bool getTargetShuffleMask(SDNode *N, MVT VT,
05307                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
05308   unsigned NumElems = VT.getVectorNumElements();
05309   SDValue ImmN;
05310 
05311   IsUnary = false;
05312   bool IsFakeUnary = false;
05313   switch(N->getOpcode()) {
05314   case X86ISD::BLENDI:
05315     ImmN = N->getOperand(N->getNumOperands()-1);
05316     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05317     break;
05318   case X86ISD::SHUFP:
05319     ImmN = N->getOperand(N->getNumOperands()-1);
05320     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05321     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05322     break;
05323   case X86ISD::UNPCKH:
05324     DecodeUNPCKHMask(VT, Mask);
05325     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05326     break;
05327   case X86ISD::UNPCKL:
05328     DecodeUNPCKLMask(VT, Mask);
05329     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05330     break;
05331   case X86ISD::MOVHLPS:
05332     DecodeMOVHLPSMask(NumElems, Mask);
05333     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05334     break;
05335   case X86ISD::MOVLHPS:
05336     DecodeMOVLHPSMask(NumElems, Mask);
05337     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05338     break;
05339   case X86ISD::PALIGNR:
05340     ImmN = N->getOperand(N->getNumOperands()-1);
05341     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05342     break;
05343   case X86ISD::PSHUFD:
05344   case X86ISD::VPERMILPI:
05345     ImmN = N->getOperand(N->getNumOperands()-1);
05346     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05347     IsUnary = true;
05348     break;
05349   case X86ISD::PSHUFHW:
05350     ImmN = N->getOperand(N->getNumOperands()-1);
05351     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05352     IsUnary = true;
05353     break;
05354   case X86ISD::PSHUFLW:
05355     ImmN = N->getOperand(N->getNumOperands()-1);
05356     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05357     IsUnary = true;
05358     break;
05359   case X86ISD::PSHUFB: {
05360     IsUnary = true;
05361     SDValue MaskNode = N->getOperand(1);
05362     while (MaskNode->getOpcode() == ISD::BITCAST)
05363       MaskNode = MaskNode->getOperand(0);
05364 
05365     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
05366       // If we have a build-vector, then things are easy.
05367       EVT VT = MaskNode.getValueType();
05368       assert(VT.isVector() &&
05369              "Can't produce a non-vector with a build_vector!");
05370       if (!VT.isInteger())
05371         return false;
05372 
05373       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
05374 
05375       SmallVector<uint64_t, 32> RawMask;
05376       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
05377         SDValue Op = MaskNode->getOperand(i);
05378         if (Op->getOpcode() == ISD::UNDEF) {
05379           RawMask.push_back((uint64_t)SM_SentinelUndef);
05380           continue;
05381         }
05382         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
05383         if (!CN)
05384           return false;
05385         APInt MaskElement = CN->getAPIntValue();
05386 
05387         // We now have to decode the element which could be any integer size and
05388         // extract each byte of it.
05389         for (int j = 0; j < NumBytesPerElement; ++j) {
05390           // Note that this is x86 and so always little endian: the low byte is
05391           // the first byte of the mask.
05392           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
05393           MaskElement = MaskElement.lshr(8);
05394         }
05395       }
05396       DecodePSHUFBMask(RawMask, Mask);
05397       break;
05398     }
05399 
05400     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
05401     if (!MaskLoad)
05402       return false;
05403 
05404     SDValue Ptr = MaskLoad->getBasePtr();
05405     if (Ptr->getOpcode() == X86ISD::Wrapper)
05406       Ptr = Ptr->getOperand(0);
05407 
05408     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
05409     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
05410       return false;
05411 
05412     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
05413       // FIXME: Support AVX-512 here.
05414       Type *Ty = C->getType();
05415       if (!Ty->isVectorTy() || (Ty->getVectorNumElements() != 16 &&
05416                                 Ty->getVectorNumElements() != 32))
05417         return false;
05418 
05419       DecodePSHUFBMask(C, Mask);
05420       break;
05421     }
05422 
05423     return false;
05424   }
05425   case X86ISD::VPERMI:
05426     ImmN = N->getOperand(N->getNumOperands()-1);
05427     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05428     IsUnary = true;
05429     break;
05430   case X86ISD::MOVSS:
05431   case X86ISD::MOVSD: {
05432     // The index 0 always comes from the first element of the second source;
05433     // this is why MOVSS and MOVSD are used in the first place. The other
05434     // elements come from the other positions of the first source vector.
05435     Mask.push_back(NumElems);
05436     for (unsigned i = 1; i != NumElems; ++i) {
05437       Mask.push_back(i);
05438     }
05439     break;
05440   }
05441   case X86ISD::VPERM2X128:
05442     ImmN = N->getOperand(N->getNumOperands()-1);
05443     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05444     if (Mask.empty()) return false;
05445     break;
05446   case X86ISD::MOVSLDUP:
05447     DecodeMOVSLDUPMask(VT, Mask);
05448     break;
05449   case X86ISD::MOVSHDUP:
05450     DecodeMOVSHDUPMask(VT, Mask);
05451     break;
05452   case X86ISD::MOVDDUP:
05453   case X86ISD::MOVLHPD:
05454   case X86ISD::MOVLPD:
05455   case X86ISD::MOVLPS:
05456     // Not yet implemented
05457     return false;
05458   default: llvm_unreachable("unknown target shuffle node");
05459   }
05460 
05461   // If we have a fake unary shuffle, the shuffle mask is spread across two
05462   // inputs that are actually the same node. Re-map the mask to always point
05463   // into the first input.
05464   if (IsFakeUnary)
05465     for (int &M : Mask)
05466       if (M >= (int)Mask.size())
05467         M -= Mask.size();
05468 
05469   return true;
05470 }
05471 
05472 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
05473 /// element of the result of the vector shuffle.
05474 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
05475                                    unsigned Depth) {
05476   if (Depth == 6)
05477     return SDValue();  // Limit search depth.
05478 
05479   SDValue V = SDValue(N, 0);
05480   EVT VT = V.getValueType();
05481   unsigned Opcode = V.getOpcode();
05482 
05483   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
05484   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
05485     int Elt = SV->getMaskElt(Index);
05486 
05487     if (Elt < 0)
05488       return DAG.getUNDEF(VT.getVectorElementType());
05489 
05490     unsigned NumElems = VT.getVectorNumElements();
05491     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
05492                                          : SV->getOperand(1);
05493     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
05494   }
05495 
05496   // Recurse into target specific vector shuffles to find scalars.
05497   if (isTargetShuffle(Opcode)) {
05498     MVT ShufVT = V.getSimpleValueType();
05499     unsigned NumElems = ShufVT.getVectorNumElements();
05500     SmallVector<int, 16> ShuffleMask;
05501     bool IsUnary;
05502 
05503     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
05504       return SDValue();
05505 
05506     int Elt = ShuffleMask[Index];
05507     if (Elt < 0)
05508       return DAG.getUNDEF(ShufVT.getVectorElementType());
05509 
05510     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
05511                                          : N->getOperand(1);
05512     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
05513                                Depth+1);
05514   }
05515 
05516   // Actual nodes that may contain scalar elements
05517   if (Opcode == ISD::BITCAST) {
05518     V = V.getOperand(0);
05519     EVT SrcVT = V.getValueType();
05520     unsigned NumElems = VT.getVectorNumElements();
05521 
05522     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
05523       return SDValue();
05524   }
05525 
05526   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
05527     return (Index == 0) ? V.getOperand(0)
05528                         : DAG.getUNDEF(VT.getVectorElementType());
05529 
05530   if (V.getOpcode() == ISD::BUILD_VECTOR)
05531     return V.getOperand(Index);
05532 
05533   return SDValue();
05534 }
05535 
05536 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
05537 /// shuffle operation which consecutively come from a zero vector. The
05538 /// search can start in two different directions, from left or right.
05539 /// We count undefs as zeros until PreferredNum is reached.
05540 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
05541                                          unsigned NumElems, bool ZerosFromLeft,
05542                                          SelectionDAG &DAG,
05543                                          unsigned PreferredNum = -1U) {
05544   unsigned NumZeros = 0;
05545   for (unsigned i = 0; i != NumElems; ++i) {
05546     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
05547     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
05548     if (!Elt.getNode())
05549       break;
05550 
05551     if (X86::isZeroNode(Elt))
05552       ++NumZeros;
05553     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
05554       NumZeros = std::min(NumZeros + 1, PreferredNum);
05555     else
05556       break;
05557   }
05558 
05559   return NumZeros;
05560 }
05561 
05562 /// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
05563 /// correspond consecutively to elements from one of the vector operands,
05564 /// starting from its index OpIdx. Also set OpNum to indicate which source
05565 /// vector operand was used.
05565 static
05566 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
05567                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
05568                               unsigned NumElems, unsigned &OpNum) {
05569   bool SeenV1 = false;
05570   bool SeenV2 = false;
05571 
05572   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
05573     int Idx = SVOp->getMaskElt(i);
05574     // Ignore undef indices
05575     if (Idx < 0)
05576       continue;
05577 
05578     if (Idx < (int)NumElems)
05579       SeenV1 = true;
05580     else
05581       SeenV2 = true;
05582 
05583     // Only accept consecutive elements from the same vector
05584     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
05585       return false;
05586   }
05587 
05588   OpNum = SeenV1 ? 0 : 1;
05589   return true;
05590 }
05591 
05592 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
05593 /// logical right shift of a vector.
05594 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05595                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05596   unsigned NumElems =
05597     SVOp->getSimpleValueType(0).getVectorNumElements();
05598   unsigned NumZeros = getNumOfConsecutiveZeros(
05599       SVOp, NumElems, false /* check zeros from right */, DAG,
05600       SVOp->getMaskElt(0));
05601   unsigned OpSrc;
05602 
05603   if (!NumZeros)
05604     return false;
05605 
05606   // Considering the elements in the mask that are not consecutive zeros,
05607   // check if they consecutively come from only one of the source vectors.
05608   //
05609   //               V1 = {X, A, B, C}     0
05610   //                         \  \  \    /
05611   //   vector_shuffle V1, V2 <1, 2, 3, X>
05612   //
05613   if (!isShuffleMaskConsecutive(SVOp,
05614             0,                   // Mask Start Index
05615             NumElems-NumZeros,   // Mask End Index(exclusive)
05616             NumZeros,            // Where to start looking in the src vector
05617             NumElems,            // Number of elements in vector
05618             OpSrc))              // Which source operand ?
05619     return false;
05620 
05621   isLeft = false;
05622   ShAmt = NumZeros;
05623   ShVal = SVOp->getOperand(OpSrc);
05624   return true;
05625 }
05626 
05627 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
05628 /// logical left shift of a vector.
05629 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05630                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05631   unsigned NumElems =
05632     SVOp->getSimpleValueType(0).getVectorNumElements();
05633   unsigned NumZeros = getNumOfConsecutiveZeros(
05634       SVOp, NumElems, true /* check zeros from left */, DAG,
05635       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
05636   unsigned OpSrc;
05637 
05638   if (!NumZeros)
05639     return false;
05640 
05641   // Considering the elements in the mask that are not consecutive zeros,
05642   // check if they consecutively come from only one of the source vectors.
05643   //
05644   //                           0    { A, B, X, X } = V2
05645   //                          / \    /  /
05646   //   vector_shuffle V1, V2 <X, X, 4, 5>
05647   //
05648   if (!isShuffleMaskConsecutive(SVOp,
05649             NumZeros,     // Mask Start Index
05650             NumElems,     // Mask End Index(exclusive)
05651             0,            // Where to start looking in the src vector
05652             NumElems,     // Number of elements in vector
05653             OpSrc))       // Which source operand ?
05654     return false;
05655 
05656   isLeft = true;
05657   ShAmt = NumZeros;
05658   ShVal = SVOp->getOperand(OpSrc);
05659   return true;
05660 }
05661 
05662 /// isVectorShift - Returns true if the shuffle can be implemented as a
05663 /// logical left or right shift of a vector.
05664 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05665                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05666   // Although the logic below support any bitwidth size, there are no
05667   // shift instructions which handle more than 128-bit vectors.
05668   if (!SVOp->getSimpleValueType(0).is128BitVector())
05669     return false;
05670 
05671   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
05672       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
05673     return true;
05674 
05675   return false;
05676 }
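// Editor's note (worked example, not in the original source): a v8i16 shuffle
// with mask <1,2,3,4,5,6,7,Z>, where Z indexes into an all-zeros operand, is
// recognised by isVectorShiftRight with ShAmt=1: one trailing zero, and the
// remaining mask entries consecutively index one source starting at element 1.
// Conversely <Z,0,1,2,3,4,5,6> is matched by isVectorShiftLeft with ShAmt=1.
// Callers can then lower the whole shuffle as a full-register byte shift via
// getVShift (defined further below).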
05677 
05678 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
05679 ///
05680 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
05681                                        unsigned NumNonZero, unsigned NumZero,
05682                                        SelectionDAG &DAG,
05683                                        const X86Subtarget* Subtarget,
05684                                        const TargetLowering &TLI) {
05685   if (NumNonZero > 8)
05686     return SDValue();
05687 
05688   SDLoc dl(Op);
05689   SDValue V;
05690   bool First = true;
05691   for (unsigned i = 0; i < 16; ++i) {
05692     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
05693     if (ThisIsNonZero && First) {
05694       if (NumZero)
05695         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05696       else
05697         V = DAG.getUNDEF(MVT::v8i16);
05698       First = false;
05699     }
05700 
05701     if ((i & 1) != 0) {
05702       SDValue ThisElt, LastElt;
05703       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
05704       if (LastIsNonZero) {
05705         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
05706                               MVT::i16, Op.getOperand(i-1));
05707       }
05708       if (ThisIsNonZero) {
05709         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
05710         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
05711                               ThisElt, DAG.getConstant(8, MVT::i8));
05712         if (LastIsNonZero)
05713           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
05714       } else
05715         ThisElt = LastElt;
05716 
05717       if (ThisElt.getNode())
05718         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
05719                         DAG.getIntPtrConstant(i/2));
05720     }
05721   }
05722 
05723   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
05724 }
05725 
05726 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
05727 ///
05728 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
05729                                      unsigned NumNonZero, unsigned NumZero,
05730                                      SelectionDAG &DAG,
05731                                      const X86Subtarget* Subtarget,
05732                                      const TargetLowering &TLI) {
05733   if (NumNonZero > 4)
05734     return SDValue();
05735 
05736   SDLoc dl(Op);
05737   SDValue V;
05738   bool First = true;
05739   for (unsigned i = 0; i < 8; ++i) {
05740     bool isNonZero = (NonZeros & (1 << i)) != 0;
05741     if (isNonZero) {
05742       if (First) {
05743         if (NumZero)
05744           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05745         else
05746           V = DAG.getUNDEF(MVT::v8i16);
05747         First = false;
05748       }
05749       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
05750                       MVT::v8i16, V, Op.getOperand(i),
05751                       DAG.getIntPtrConstant(i));
05752     }
05753   }
05754 
05755   return V;
05756 }
05757 
05758 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
05759 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
05760                                      const X86Subtarget *Subtarget,
05761                                      const TargetLowering &TLI) {
05762   // Find all zeroable elements.
05763   bool Zeroable[4];
05764   for (int i=0; i < 4; ++i) {
05765     SDValue Elt = Op->getOperand(i);
05766     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
05767   }
05768   assert(std::count_if(&Zeroable[0], &Zeroable[4],
05769                        [](bool M) { return !M; }) > 1 &&
05770          "We expect at least two non-zero elements!");
05771 
05772   // We only know how to deal with build_vector nodes where elements are either
05773   // zeroable or extract_vector_elt with constant index.
05774   SDValue FirstNonZero;
05775   unsigned FirstNonZeroIdx;
05776   for (unsigned i=0; i < 4; ++i) {
05777     if (Zeroable[i])
05778       continue;
05779     SDValue Elt = Op->getOperand(i);
05780     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
05781         !isa<ConstantSDNode>(Elt.getOperand(1)))
05782       return SDValue();
05783     // Make sure that this node is extracting from a 128-bit vector.
05784     MVT VT = Elt.getOperand(0).getSimpleValueType();
05785     if (!VT.is128BitVector())
05786       return SDValue();
05787     if (!FirstNonZero.getNode()) {
05788       FirstNonZero = Elt;
05789       FirstNonZeroIdx = i;
05790     }
05791   }
05792 
05793   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
05794   SDValue V1 = FirstNonZero.getOperand(0);
05795   MVT VT = V1.getSimpleValueType();
05796 
05797   // See if this build_vector can be lowered as a blend with zero.
05798   SDValue Elt;
05799   unsigned EltMaskIdx, EltIdx;
05800   int Mask[4];
05801   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
05802     if (Zeroable[EltIdx]) {
05803       // The zero vector will be on the right hand side.
05804       Mask[EltIdx] = EltIdx+4;
05805       continue;
05806     }
05807 
05808     Elt = Op->getOperand(EltIdx);
05809     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
05810     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
05811     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
05812       break;
05813     Mask[EltIdx] = EltIdx;
05814   }
05815 
05816   if (EltIdx == 4) {
05817     // Let the shuffle legalizer deal with blend operations.
05818     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
05819     if (V1.getSimpleValueType() != VT)
05820       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
05821     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
05822   }
05823 
05824   // See if we can lower this build_vector to a INSERTPS.
05825   if (!Subtarget->hasSSE41())
05826     return SDValue();
05827 
05828   SDValue V2 = Elt.getOperand(0);
05829   if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
05830     V1 = SDValue();
05831 
05832   bool CanFold = true;
05833   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
05834     if (Zeroable[i])
05835       continue;
05836 
05837     SDValue Current = Op->getOperand(i);
05838     SDValue SrcVector = Current->getOperand(0);
05839     if (!V1.getNode())
05840       V1 = SrcVector;
05841     CanFold = SrcVector == V1 &&
05842       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
05843   }
05844 
05845   if (!CanFold)
05846     return SDValue();
05847 
05848   assert(V1.getNode() && "Expected at least two non-zero elements!");
05849   if (V1.getSimpleValueType() != MVT::v4f32)
05850     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
05851   if (V2.getSimpleValueType() != MVT::v4f32)
05852     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
05853 
05854   // Ok, we can emit an INSERTPS instruction.
05855   unsigned ZMask = 0;
05856   for (int i = 0; i < 4; ++i)
05857     if (Zeroable[i])
05858       ZMask |= 1 << i;
05859 
05860   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
05861   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
05862   SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
05863                                DAG.getIntPtrConstant(InsertPSMask));
05864   return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
05865 }
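// Editor's note (worked example, not in the original source): the INSERTPS
// immediate built above follows the instruction's encoding: bits 7:6 select
// the source element of V2, bits 5:4 the destination lane, and bits 3:0 zero
// out destination lanes. Inserting element 2 of V2 into lane 1 while zeroing
// lanes 0 and 3 gives (2<<6) | (1<<4) | 0b1001 = 0x99.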
05866 
05867 /// getVShift - Return a vector logical shift node.
05868 ///
05869 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
05870                          unsigned NumBits, SelectionDAG &DAG,
05871                          const TargetLowering &TLI, SDLoc dl) {
05872   assert(VT.is128BitVector() && "Unknown type for VShift");
05873   EVT ShVT = MVT::v2i64;
05874   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
05875   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
05876   return DAG.getNode(ISD::BITCAST, dl, VT,
05877                      DAG.getNode(Opc, dl, ShVT, SrcOp,
05878                              DAG.getConstant(NumBits,
05879                                   TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
05880 }
05881 
05882 static SDValue
05883 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
05884 
05885   // Check if the scalar load can be widened into a vector load. And if
05886   // the address is "base + cst" see if the cst can be "absorbed" into
05887   // the shuffle mask.
05888   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
05889     SDValue Ptr = LD->getBasePtr();
05890     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
05891       return SDValue();
05892     EVT PVT = LD->getValueType(0);
05893     if (PVT != MVT::i32 && PVT != MVT::f32)
05894       return SDValue();
05895 
05896     int FI = -1;
05897     int64_t Offset = 0;
05898     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
05899       FI = FINode->getIndex();
05900       Offset = 0;
05901     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
05902                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
05903       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
05904       Offset = Ptr.getConstantOperandVal(1);
05905       Ptr = Ptr.getOperand(0);
05906     } else {
05907       return SDValue();
05908     }
05909 
05910     // FIXME: 256-bit vector instructions don't require a strict alignment,
05911     // improve this code to support it better.
05912     unsigned RequiredAlign = VT.getSizeInBits()/8;
05913     SDValue Chain = LD->getChain();
05914     // Make sure the stack object alignment is at least 16 or 32.
05915     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
05916     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
05917       if (MFI->isFixedObjectIndex(FI)) {
05918         // Can't change the alignment. FIXME: It's possible to compute
05919         // the exact stack offset and reference FI + adjust offset instead,
05920         // if someone *really* cares about this; that's the way to implement it.
05921         return SDValue();
05922       } else {
05923         MFI->setObjectAlignment(FI, RequiredAlign);
05924       }
05925     }
05926 
05927     // (Offset % 16 or 32) must be a multiple of 4. The address is then
05928     // Ptr + (Offset & ~15).
05929     if (Offset < 0)
05930       return SDValue();
05931     if ((Offset % RequiredAlign) & 3)
05932       return SDValue();
05933     int64_t StartOffset = Offset & ~(RequiredAlign-1);
05934     if (StartOffset)
05935       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
05936                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
05937 
05938     int EltNo = (Offset - StartOffset) >> 2;
05939     unsigned NumElems = VT.getVectorNumElements();
05940 
05941     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
05942     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
05943                              LD->getPointerInfo().getWithOffset(StartOffset),
05944                              false, false, false, 0);
05945 
05946     SmallVector<int, 8> Mask;
05947     for (unsigned i = 0; i != NumElems; ++i)
05948       Mask.push_back(EltNo);
05949 
05950     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
05951   }
05952 
05953   return SDValue();
05954 }
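// Editor's note (worked example, not in the original source): splatting an
// f32 loaded from a stack slot at offset 8 into a v4f32 has RequiredAlign=16,
// StartOffset = 8 & ~15 = 0 and EltNo = (8-0)>>2 = 2, so the code emits a
// full v4f32 load of the (re-aligned) slot followed by a <2,2,2,2> shuffle
// instead of a scalar load plus a separate splat.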
05955 
05956 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
05957 /// vector of type 'VT', see if the elements can be replaced by a single large
05958 /// load which has the same value as a build_vector whose operands are 'elts'.
05959 ///
05960 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
05961 ///
05962 /// FIXME: we'd also like to handle the case where the last elements are zero
05963 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
05964 /// There's even a handy isZeroNode for that purpose.
05965 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
05966                                         SDLoc &DL, SelectionDAG &DAG,
05967                                         bool isAfterLegalize) {
05968   EVT EltVT = VT.getVectorElementType();
05969   unsigned NumElems = Elts.size();
05970 
05971   LoadSDNode *LDBase = nullptr;
05972   unsigned LastLoadedElt = -1U;
05973 
05974   // For each element in the initializer, see if we've found a load or an undef.
05975   // If we don't find an initial load element, or later load elements are
05976   // non-consecutive, bail out.
05977   for (unsigned i = 0; i < NumElems; ++i) {
05978     SDValue Elt = Elts[i];
05979 
05980     if (!Elt.getNode() ||
05981         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
05982       return SDValue();
05983     if (!LDBase) {
05984       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
05985         return SDValue();
05986       LDBase = cast<LoadSDNode>(Elt.getNode());
05987       LastLoadedElt = i;
05988       continue;
05989     }
05990     if (Elt.getOpcode() == ISD::UNDEF)
05991       continue;
05992 
05993     LoadSDNode *LD = cast<LoadSDNode>(Elt);
05994     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
05995       return SDValue();
05996     LastLoadedElt = i;
05997   }
05998 
05999   // If we have found an entire vector of loads and undefs, then return a large
06000   // load of the entire vector width starting at the base pointer.  If we found
06001   // consecutive loads for the low half, generate a vzext_load node.
06002   if (LastLoadedElt == NumElems - 1) {
06003 
06004     if (isAfterLegalize &&
06005         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
06006       return SDValue();
06007 
06008     SDValue NewLd = SDValue();
06009 
06010     if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16)
06011       NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
06012                           LDBase->getPointerInfo(),
06013                           LDBase->isVolatile(), LDBase->isNonTemporal(),
06014                           LDBase->isInvariant(), 0);
06015     else NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
06016                         LDBase->getPointerInfo(),
06017                         LDBase->isVolatile(), LDBase->isNonTemporal(),
06018                         LDBase->isInvariant(), LDBase->getAlignment());
06019 
06020     if (LDBase->hasAnyUseOfValue(1)) {
06021       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
06022                                      SDValue(LDBase, 1),
06023                                      SDValue(NewLd.getNode(), 1));
06024       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
06025       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
06026                              SDValue(NewLd.getNode(), 1));
06027     }
06028 
06029     return NewLd;
06030   }
06031   
06032   // TODO: The code below fires only for loading the low v2i32 / v2f32
06033   //of a v4i32 / v4f32. It's probably worth generalizing.
06034   if (NumElems == 4 && LastLoadedElt == 1 && (EltVT.getSizeInBits() == 32) &&
06035       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
06036     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
06037     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
06038     SDValue ResNode =
06039         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
06040                                 LDBase->getPointerInfo(),
06041                                 LDBase->getAlignment(),
06042                                 false/*isVolatile*/, true/*ReadMem*/,
06043                                 false/*WriteMem*/);
06044 
06045     // Make sure the newly-created LOAD is in the same position as LDBase in
06046     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
06047     // update uses of LDBase's output chain to use the TokenFactor.
06048     if (LDBase->hasAnyUseOfValue(1)) {
06049       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
06050                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
06051       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
06052       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
06053                              SDValue(ResNode.getNode(), 1));
06054     }
06055 
06056     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
06057   }
06058   return SDValue();
06059 }
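// Editor's note (worked example, not in the original source): for a v4i32
// build_vector <load [p], load [p+4], load [p+8], load [p+12]> all four
// elements are consecutive loads, so the routine returns a single full-width
// v4i32 load from p. If only elements 0 and 1 are such loads and the rest are
// undef, the second path fires instead and emits an X86ISD::VZEXT_LOAD of
// i64, bitcast back to v4i32.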
06060 
06061 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
06062 /// to generate a splat value for the following cases:
06063 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
06064 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
06065 /// a scalar load, or a constant.
06066 /// The VBROADCAST node is returned when a pattern is found,
06067 /// or SDValue() otherwise.
06068 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
06069                                     SelectionDAG &DAG) {
06070   // VBROADCAST requires AVX.
06071   // TODO: Splats could be generated for non-AVX CPUs using SSE
06072   // instructions, but there's less potential gain for only 128-bit vectors.
06073   if (!Subtarget->hasAVX())
06074     return SDValue();
06075 
06076   MVT VT = Op.getSimpleValueType();
06077   SDLoc dl(Op);
06078 
06079   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
06080          "Unsupported vector type for broadcast.");
06081 
06082   SDValue Ld;
06083   bool ConstSplatVal;
06084 
06085   switch (Op.getOpcode()) {
06086     default:
06087       // Unknown pattern found.
06088       return SDValue();
06089 
06090     case ISD::BUILD_VECTOR: {
06091       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
06092       BitVector UndefElements;
06093       SDValue Splat = BVOp->getSplatValue(&UndefElements);
06094 
06095       // We need a splat of a single value to use broadcast, and it doesn't
06096       // make any sense if the value is only in one element of the vector.
06097       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
06098         return SDValue();
06099 
06100       Ld = Splat;
06101       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
06102                        Ld.getOpcode() == ISD::ConstantFP);
06103 
06104       // Make sure that all of the users of a non-constant load are from the
06105       // BUILD_VECTOR node.
06106       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
06107         return SDValue();
06108       break;
06109     }
06110 
06111     case ISD::VECTOR_SHUFFLE: {
06112       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
06113 
06114       // Shuffles must have a splat mask where the first element is
06115       // broadcasted.
06116       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
06117         return SDValue();
06118 
06119       SDValue Sc = Op.getOperand(0);
06120       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
06121           Sc.getOpcode() != ISD::BUILD_VECTOR) {
06122 
06123         if (!Subtarget->hasInt256())
06124           return SDValue();
06125 
06126         // Use the register form of the broadcast instruction available on AVX2.
06127         if (VT.getSizeInBits() >= 256)
06128           Sc = Extract128BitVector(Sc, 0, DAG, dl);
06129         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
06130       }
06131 
06132       Ld = Sc.getOperand(0);
06133       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
06134                        Ld.getOpcode() == ISD::ConstantFP);
06135 
06136       // The scalar_to_vector node and the suspected
06137       // load node must have exactly one user.
06138       // Constants may have multiple users.
06139 
06140       // AVX-512 has a register version of the broadcast.
06141       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
06142         Ld.getValueType().getSizeInBits() >= 32;
06143       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
06144           !hasRegVer))
06145         return SDValue();
06146       break;
06147     }
06148   }
06149 
06150   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
06151   bool IsGE256 = (VT.getSizeInBits() >= 256);
06152 
06153   // When optimizing for size, generate up to 5 extra bytes for a broadcast
06154   // instruction to save 8 or more bytes of constant pool data.
06155   // TODO: If multiple splats are generated to load the same constant,
06156   // it may be detrimental to overall size. There needs to be a way to detect
06157   // that condition to know if this is truly a size win.
06158   const Function *F = DAG.getMachineFunction().getFunction();
06159   bool OptForSize = F->getAttributes().
06160     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
06161 
06162   // Handle broadcasting a single constant scalar from the constant pool
06163   // into a vector.
06164   // On Sandybridge (no AVX2), it is still better to load a constant vector
06165   // from the constant pool and not to broadcast it from a scalar.
06166   // But override that restriction when optimizing for size.
06167   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
06168   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
06169     EVT CVT = Ld.getValueType();
06170     assert(!CVT.isVector() && "Must not broadcast a vector type");
06171 
06172     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
06173     // For size optimization, also splat v2f64 and v2i64, and for size opt
06174     // with AVX2, also splat i8 and i16.
06175     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
06176     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
06177         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
06178       const Constant *C = nullptr;
06179       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
06180         C = CI->getConstantIntValue();
06181       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
06182         C = CF->getConstantFPValue();
06183 
06184       assert(C && "Invalid constant type");
06185 
06186       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
06187       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
06188       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
06189       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
06190                        MachinePointerInfo::getConstantPool(),
06191                        false, false, false, Alignment);
06192 
06193       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06194     }
06195   }
06196 
06197   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
06198 
06199   // Handle AVX2 in-register broadcasts.
06200   if (!IsLoad && Subtarget->hasInt256() &&
06201       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
06202     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06203 
06204   // The scalar source must be a normal load.
06205   if (!IsLoad)
06206     return SDValue();
06207 
06208   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
06209       (Subtarget->hasVLX() && ScalarSize == 64))
06210     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06211 
06212   // The integer check is needed for the 64-bit broadcast into a 128-bit vector, so it
06213   // doesn't match double, since there is no vbroadcastsd with an xmm destination.
06214   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
06215     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
06216       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06217   }
06218 
06219   // Unsupported broadcast.
06220   return SDValue();
06221 }
06222 
06223 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
06224 /// underlying vector and index.
06225 ///
06226 /// Modifies \p ExtractedFromVec to the real vector and returns the real
06227 /// index.
06228 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
06229                                          SDValue ExtIdx) {
06230   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
06231   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
06232     return Idx;
06233 
06234   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
06235   // lowered this:
06236   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
06237   // to:
06238   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
06239   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
06240   //                           undef)
06241   //                       Constant<0>)
06242   // In this case the vector is the extract_subvector expression and the index
06243   // is 2, as specified by the shuffle.
06244   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
06245   SDValue ShuffleVec = SVOp->getOperand(0);
06246   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
06247   assert(ShuffleVecVT.getVectorElementType() ==
06248          ExtractedFromVec.getSimpleValueType().getVectorElementType());
06249 
06250   int ShuffleIdx = SVOp->getMaskElt(Idx);
06251   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
06252     ExtractedFromVec = ShuffleVec;
06253     return ShuffleIdx;
06254   }
06255   return Idx;
06256 }
06257 
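/// \brief Try to lower a BUILD_VECTOR whose operands are mostly
/// EXTRACT_VECTOR_ELTs of at most two source vectors as a shuffle of those
/// sources, followed by INSERT_VECTOR_ELT nodes for the few remaining
/// non-extracted operands. Returns an empty SDValue on failure.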
06258 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
06259   MVT VT = Op.getSimpleValueType();
06260 
06261   // Skip if insert_vec_elt is not supported.
06262   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
06263   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
06264     return SDValue();
06265 
06266   SDLoc DL(Op);
06267   unsigned NumElems = Op.getNumOperands();
06268 
06269   SDValue VecIn1;
06270   SDValue VecIn2;
06271   SmallVector<unsigned, 4> InsertIndices;
06272   SmallVector<int, 8> Mask(NumElems, -1);
06273 
06274   for (unsigned i = 0; i != NumElems; ++i) {
06275     unsigned Opc = Op.getOperand(i).getOpcode();
06276 
06277     if (Opc == ISD::UNDEF)
06278       continue;
06279 
06280     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
06281       // Quit if more than 1 element needs inserting.
06282       if (InsertIndices.size() > 1)
06283         return SDValue();
06284 
06285       InsertIndices.push_back(i);
06286       continue;
06287     }
06288 
06289     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
06290     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
06291     // Quit if non-constant index.
06292     if (!isa<ConstantSDNode>(ExtIdx))
06293       return SDValue();
06294     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
06295 
06296     // Quit if extracted from a vector of a different type.
06297     if (ExtractedFromVec.getValueType() != VT)
06298       return SDValue();
06299 
06300     if (!VecIn1.getNode())
06301       VecIn1 = ExtractedFromVec;
06302     else if (VecIn1 != ExtractedFromVec) {
06303       if (!VecIn2.getNode())
06304         VecIn2 = ExtractedFromVec;
06305       else if (VecIn2 != ExtractedFromVec)
06306         // Quit if more than 2 vectors are needed for the shuffle.
06307         return SDValue();
06308     }
06309 
06310     if (ExtractedFromVec == VecIn1)
06311       Mask[i] = Idx;
06312     else if (ExtractedFromVec == VecIn2)
06313       Mask[i] = Idx + NumElems;
06314   }
06315 
06316   if (!VecIn1.getNode())
06317     return SDValue();
06318 
06319   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
06320   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
06321   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
06322     unsigned Idx = InsertIndices[i];
06323     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
06324                      DAG.getIntPtrConstant(Idx));
06325   }
06326 
06327   return NV;
06328 }
06329 
06330 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
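// For example, an all-constant (v8i1 (build_vector 1, 0, 1, 1, 0, 0, 0, 0)) is
// lowered by collecting the constant bits into an immediate (0x0D here),
// bitcasting that immediate to v16i1 and extracting the low v8i1 subvector.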
06331 SDValue
06332 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
06333 
06334   MVT VT = Op.getSimpleValueType();
06335   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
06336          "Unexpected type in LowerBUILD_VECTORvXi1!");
06337 
06338   SDLoc dl(Op);
06339   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
06340     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
06341     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
06342     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
06343   }
06344 
06345   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
06346     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
06347     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
06348     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
06349   }
06350 
06351   bool AllConstants = true;
06352   uint64_t Immediate = 0;
06353   int NonConstIdx = -1;
06354   bool IsSplat = true;
06355   unsigned NumNonConsts = 0;
06356   unsigned NumConsts = 0;
06357   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
06358     SDValue In = Op.getOperand(idx);
06359     if (In.getOpcode() == ISD::UNDEF)
06360       continue;
06361     if (!isa<ConstantSDNode>(In)) {
06362       AllConstants = false;
06363       NonConstIdx = idx;
06364       NumNonConsts++;
06365     } else {
06366       NumConsts++;
06367       if (cast<ConstantSDNode>(In)->getZExtValue())
06368         Immediate |= (1ULL << idx);
06369     }
06370     if (In != Op.getOperand(0))
06371       IsSplat = false;
06372   }
06373 
06374   if (AllConstants) {
06375     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
06376       DAG.getConstant(Immediate, MVT::i16));
06377     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
06378                        DAG.getIntPtrConstant(0));
06379   }
06380 
06381   if (NumNonConsts == 1 && NonConstIdx != 0) {
06382     SDValue DstVec;
06383     if (NumConsts) {
06384       SDValue VecAsImm = DAG.getConstant(Immediate,
06385                                          MVT::getIntegerVT(VT.getSizeInBits()));
06386       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
06387     }
06388     else
06389       DstVec = DAG.getUNDEF(VT);
06390     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
06391                        Op.getOperand(NonConstIdx),
06392                        DAG.getIntPtrConstant(NonConstIdx));
06393   }
06394   if (!IsSplat && (NonConstIdx != 0))
06395     llvm_unreachable("Unsupported BUILD_VECTOR operation");
06396   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
06397   SDValue Select;
06398   if (IsSplat)
06399     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
06400                           DAG.getConstant(-1, SelectVT),
06401                           DAG.getConstant(0, SelectVT));
06402   else
06403     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
06404                          DAG.getConstant((Immediate | 1), SelectVT),
06405                          DAG.getConstant(Immediate, SelectVT));
06406   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
06407 }
06408 
06409 /// \brief Return true if \p N implements a horizontal binop and return the
06410 /// operands for the horizontal binop into V0 and V1.
06411 ///
06412 /// This is a helper function of PerformBUILD_VECTORCombine.
06413 /// This function checks whether the input build_vector \p N implements a
06414 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
06415 /// operation to match.
06416 /// For example, if \p Opcode is equal to ISD::ADD, then this function
06417 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
06418 /// is equal to ISD::SUB, then this function checks if this is a horizontal
06419 /// arithmetic sub.
06420 ///
06421 /// This function only analyzes elements of \p N whose indices are
06422 /// in range [BaseIdx, LastIdx).
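/// For example, with \p Opcode equal to ISD::FADD, BaseIdx == 0 and
/// LastIdx == 4, the v4f32 build_vector
///   (fadd (extract A, 0), (extract A, 1)),
///   (fadd (extract A, 2), (extract A, 3)),
///   (fadd (extract B, 0), (extract B, 1)),
///   (fadd (extract B, 2), (extract B, 3))
/// is matched with V0 == A and V1 == B, i.e. it corresponds to (fhadd A, B).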
06423 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
06424                               SelectionDAG &DAG,
06425                               unsigned BaseIdx, unsigned LastIdx,
06426                               SDValue &V0, SDValue &V1) {
06427   EVT VT = N->getValueType(0);
06428 
06429   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
06430   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
06431          "Invalid Vector in input!");
06432 
06433   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
06434   bool CanFold = true;
06435   unsigned ExpectedVExtractIdx = BaseIdx;
06436   unsigned NumElts = LastIdx - BaseIdx;
06437   V0 = DAG.getUNDEF(VT);
06438   V1 = DAG.getUNDEF(VT);
06439 
06440   // Check if N implements a horizontal binop.
06441   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
06442     SDValue Op = N->getOperand(i + BaseIdx);
06443 
06444     // Skip UNDEFs.
06445     if (Op->getOpcode() == ISD::UNDEF) {
06446       // Update the expected vector extract index.
06447       if (i * 2 == NumElts)
06448         ExpectedVExtractIdx = BaseIdx;
06449       ExpectedVExtractIdx += 2;
06450       continue;
06451     }
06452 
06453     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
06454 
06455     if (!CanFold)
06456       break;
06457 
06458     SDValue Op0 = Op.getOperand(0);
06459     SDValue Op1 = Op.getOperand(1);
06460 
06461     // Try to match the following pattern:
06462     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
06463     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
06464         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
06465         Op0.getOperand(0) == Op1.getOperand(0) &&
06466         isa<ConstantSDNode>(Op0.getOperand(1)) &&
06467         isa<ConstantSDNode>(Op1.getOperand(1)));
06468     if (!CanFold)
06469       break;
06470 
06471     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
06472     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
06473 
06474     if (i * 2 < NumElts) {
06475       if (V0.getOpcode() == ISD::UNDEF)
06476         V0 = Op0.getOperand(0);
06477     } else {
06478       if (V1.getOpcode() == ISD::UNDEF)
06479         V1 = Op0.getOperand(0);
06480       if (i * 2 == NumElts)
06481         ExpectedVExtractIdx = BaseIdx;
06482     }
06483 
06484     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
06485     if (I0 == ExpectedVExtractIdx)
06486       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
06487     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
06488       // Try to match the following dag sequence:
06489       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
06490       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
06491     } else
06492       CanFold = false;
06493 
06494     ExpectedVExtractIdx += 2;
06495   }
06496 
06497   return CanFold;
06498 }
06499 
06500 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
06501 /// a concat_vector.
06502 ///
06503 /// This is a helper function of PerformBUILD_VECTORCombine.
06504 /// This function expects two 256-bit vectors called V0 and V1.
06505 /// At first, each vector is split into two separate 128-bit vectors.
06506 /// Then, the resulting 128-bit vectors are used to implement two
06507 /// horizontal binary operations.
06508 ///
06509 /// The kind of horizontal binary operation is defined by \p X86Opcode.
06510 ///
06511 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed to the
06512 /// two new horizontal binops.
06513 /// When Mode is set, the first horizontal binop dag node takes as input the
06514 /// lower 128-bit of V0 and the upper 128-bit of V0. The second horizontal
06515 /// binop dag node takes as input the lower 128-bit of V1 and the upper
06516 /// 128-bit of V1.
06517 ///   Example:
06518 ///     HADD V0_LO, V0_HI
06519 ///     HADD V1_LO, V1_HI
06520 ///
06521 /// Otherwise, the first horizontal binop dag node takes as input the lower
06522 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
06523 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
06524 ///   Example:
06525 ///     HADD V0_LO, V1_LO
06526 ///     HADD V0_HI, V1_HI
06527 ///
06528 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
06529 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
06530 /// the upper 128-bits of the result.
06531 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
06532                                      SDLoc DL, SelectionDAG &DAG,
06533                                      unsigned X86Opcode, bool Mode,
06534                                      bool isUndefLO, bool isUndefHI) {
06535   EVT VT = V0.getValueType();
06536   assert(VT.is256BitVector() && VT == V1.getValueType() &&
06537          "Invalid nodes in input!");
06538 
06539   unsigned NumElts = VT.getVectorNumElements();
06540   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
06541   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
06542   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
06543   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
06544   EVT NewVT = V0_LO.getValueType();
06545 
06546   SDValue LO = DAG.getUNDEF(NewVT);
06547   SDValue HI = DAG.getUNDEF(NewVT);
06548 
06549   if (Mode) {
06550     // Don't emit a horizontal binop if the result is expected to be UNDEF.
06551     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
06552       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
06553     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
06554       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
06555   } else {
06556     // Don't emit a horizontal binop if the result is expected to be UNDEF.
06557     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
06558                        V1_LO->getOpcode() != ISD::UNDEF))
06559       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
06560 
06561     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
06562                        V1_HI->getOpcode() != ISD::UNDEF))
06563       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
06564   }
06565 
06566   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
06567 }
06568 
06569 /// \brief Try to fold a build_vector that performs an 'addsub' into the
06570 /// sequence of 'vadd + vsub + blendi'.
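/// For example, the v4f32 build_vector
///   (fsub (extract A, 0), (extract B, 0)),
///   (fadd (extract A, 1), (extract B, 1)),
///   (fsub (extract A, 2), (extract B, 2)),
///   (fadd (extract A, 3), (extract B, 3))
/// is matched as (X86ISD::ADDSUB A, B).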
06571 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
06572                            const X86Subtarget *Subtarget) {
06573   SDLoc DL(BV);
06574   EVT VT = BV->getValueType(0);
06575   unsigned NumElts = VT.getVectorNumElements();
06576   SDValue InVec0 = DAG.getUNDEF(VT);
06577   SDValue InVec1 = DAG.getUNDEF(VT);
06578 
06579   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
06580           VT == MVT::v2f64) && "build_vector with an invalid type found!");
06581 
06582   // Odd-numbered elements in the input build vector are obtained from
06583   // adding two float elements.
06584   // Even-numbered elements in the input build vector are obtained from
06585   // subtracting two float elements.
06586   unsigned ExpectedOpcode = ISD::FSUB;
06587   unsigned NextExpectedOpcode = ISD::FADD;
06588   bool AddFound = false;
06589   bool SubFound = false;
06590 
06591   for (unsigned i = 0, e = NumElts; i != e; i++) {
06592     SDValue Op = BV->getOperand(i);
06593 
06594     // Skip 'undef' values.
06595     unsigned Opcode = Op.getOpcode();
06596     if (Opcode == ISD::UNDEF) {
06597       std::swap(ExpectedOpcode, NextExpectedOpcode);
06598       continue;
06599     }
06600 
06601     // Early exit if we found an unexpected opcode.
06602     if (Opcode != ExpectedOpcode)
06603       return SDValue();
06604 
06605     SDValue Op0 = Op.getOperand(0);
06606     SDValue Op1 = Op.getOperand(1);
06607 
06608     // Try to match the following pattern:
06609     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
06610     // Early exit if we cannot match that sequence.
06611     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
06612         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
06613         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
06614         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
06615         Op0.getOperand(1) != Op1.getOperand(1))
06616       return SDValue();
06617 
06618     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
06619     if (I0 != i)
06620       return SDValue();
06621 
06622     // We found a valid add/sub node. Update the information accordingly.
06623     if (i & 1)
06624       AddFound = true;
06625     else
06626       SubFound = true;
06627 
06628     // Update InVec0 and InVec1.
06629     if (InVec0.getOpcode() == ISD::UNDEF)
06630       InVec0 = Op0.getOperand(0);
06631     if (InVec1.getOpcode() == ISD::UNDEF)
06632       InVec1 = Op1.getOperand(0);
06633 
06634     // Make sure that the operands of each add/sub node always come
06635     // from the same pair of vectors.
06636     if (InVec0 != Op0.getOperand(0)) {
06637       if (ExpectedOpcode == ISD::FSUB)
06638         return SDValue();
06639 
06640       // FADD is commutable. Try to commute the operands
06641       // and then test again.
06642       std::swap(Op0, Op1);
06643       if (InVec0 != Op0.getOperand(0))
06644         return SDValue();
06645     }
06646 
06647     if (InVec1 != Op1.getOperand(0))
06648       return SDValue();
06649 
06650     // Update the pair of expected opcodes.
06651     std::swap(ExpectedOpcode, NextExpectedOpcode);
06652   }
06653 
06654   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
06655   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
06656       InVec1.getOpcode() != ISD::UNDEF)
06657     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
06658 
06659   return SDValue();
06660 }
06661 
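/// \brief Try to combine a build_vector into a single X86ISD::ADDSUB node, or
/// into a horizontal add/sub node (FHADD/FHSUB for floating point, HADD/HSUB
/// for integers), possibly emitted as a pair of 128-bit horizontal ops
/// followed by a CONCAT_VECTORS for 256-bit types.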
06662 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
06663                                           const X86Subtarget *Subtarget) {
06664   SDLoc DL(N);
06665   EVT VT = N->getValueType(0);
06666   unsigned NumElts = VT.getVectorNumElements();
06667   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
06668   SDValue InVec0, InVec1;
06669 
06670   // Try to match an ADDSUB.
06671   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
06672       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
06673     SDValue Value = matchAddSub(BV, DAG, Subtarget);
06674     if (Value.getNode())
06675       return Value;
06676   }
06677 
06678   // Try to match horizontal ADD/SUB.
06679   unsigned NumUndefsLO = 0;
06680   unsigned NumUndefsHI = 0;
06681   unsigned Half = NumElts/2;
06682 
06683   // Count the number of UNDEF operands in the input build_vector.
06684   for (unsigned i = 0, e = Half; i != e; ++i)
06685     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
06686       NumUndefsLO++;
06687 
06688   for (unsigned i = Half, e = NumElts; i != e; ++i)
06689     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
06690       NumUndefsHI++;
06691 
06692   // Early exit if this is either a build_vector of all UNDEFs or all the
06693   // operands but one are UNDEF.
06694   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
06695     return SDValue();
06696 
06697   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
06698     // Try to match an SSE3 float HADD/HSUB.
06699     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
06700       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
06701 
06702     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
06703       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
06704   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
06705     // Try to match an SSSE3 integer HADD/HSUB.
06706     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
06707       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
06708 
06709     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
06710       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
06711   }
06712 
06713   if (!Subtarget->hasAVX())
06714     return SDValue();
06715 
06716   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
06717     // Try to match an AVX horizontal add/sub of packed single/double
06718     // precision floating point values from 256-bit vectors.
06719     SDValue InVec2, InVec3;
06720     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
06721         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
06722         ((InVec0.getOpcode() == ISD::UNDEF ||
06723           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06724         ((InVec1.getOpcode() == ISD::UNDEF ||
06725           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06726       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
06727 
06728     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
06729         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
06730         ((InVec0.getOpcode() == ISD::UNDEF ||
06731           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06732         ((InVec1.getOpcode() == ISD::UNDEF ||
06733           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06734       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
06735   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
06736     // Try to match an AVX2 horizontal add/sub of signed integers.
06737     SDValue InVec2, InVec3;
06738     unsigned X86Opcode;
06739     bool CanFold = true;
06740 
06741     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
06742         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
06743         ((InVec0.getOpcode() == ISD::UNDEF ||
06744           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06745         ((InVec1.getOpcode() == ISD::UNDEF ||
06746           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06747       X86Opcode = X86ISD::HADD;
06748     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
06749         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
06750         ((InVec0.getOpcode() == ISD::UNDEF ||
06751           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06752         ((InVec1.getOpcode() == ISD::UNDEF ||
06753           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06754       X86Opcode = X86ISD::HSUB;
06755     else
06756       CanFold = false;
06757 
06758     if (CanFold) {
06759       // Fold this build_vector into a single horizontal add/sub.
06760       // Do this only if the target has AVX2.
06761       if (Subtarget->hasAVX2())
06762         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
06763 
06764       // Do not try to expand this build_vector into a pair of horizontal
06765       // add/sub if we can emit a pair of scalar add/sub.
06766       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
06767         return SDValue();
06768 
06769       // Convert this build_vector into a pair of horizontal binop followed by
06770       // a concat vector.
06771       bool isUndefLO = NumUndefsLO == Half;
06772       bool isUndefHI = NumUndefsHI == Half;
06773       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
06774                                    isUndefLO, isUndefHI);
06775     }
06776   }
06777 
06778   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
06779        VT == MVT::v16i16) && Subtarget->hasAVX()) {
06780     unsigned X86Opcode;
06781     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
06782       X86Opcode = X86ISD::HADD;
06783     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
06784       X86Opcode = X86ISD::HSUB;
06785     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
06786       X86Opcode = X86ISD::FHADD;
06787     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
06788       X86Opcode = X86ISD::FHSUB;
06789     else
06790       return SDValue();
06791 
06792     // Don't try to expand this build_vector into a pair of horizontal add/sub
06793     // if we can simply emit a pair of scalar add/sub.
06794     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
06795       return SDValue();
06796 
06797     // Convert this build_vector into two horizontal add/sub followed by
06798     // a concat vector.
06799     bool isUndefLO = NumUndefsLO == Half;
06800     bool isUndefHI = NumUndefsHI == Half;
06801     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
06802                                  isUndefLO, isUndefHI);
06803   }
06804 
06805   return SDValue();
06806 }
06807 
06808 SDValue
06809 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
06810   SDLoc dl(Op);
06811 
06812   MVT VT = Op.getSimpleValueType();
06813   MVT ExtVT = VT.getVectorElementType();
06814   unsigned NumElems = Op.getNumOperands();
06815 
06816   // Lower vXi1 predicate build_vectors (AVX-512 mask vectors) separately.
06817   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
06818     return LowerBUILD_VECTORvXi1(Op, DAG);
06819 
06820   // Vectors containing all zeros can be matched by pxor and xorps later
06821   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
06822     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
06823     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
06824     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
06825       return Op;
06826 
06827     return getZeroVector(VT, Subtarget, DAG, dl);
06828   }
06829 
06830   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
06831   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
06832   // vpcmpeqd on 256-bit vectors.
06833   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
06834     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
06835       return Op;
06836 
06837     if (!VT.is512BitVector())
06838       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
06839   }
06840 
06841   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
06842   if (Broadcast.getNode())
06843     return Broadcast;
06844 
06845   unsigned EVTBits = ExtVT.getSizeInBits();
06846 
06847   unsigned NumZero  = 0;
06848   unsigned NumNonZero = 0;
06849   unsigned NonZeros = 0;
06850   bool IsAllConstants = true;
06851   SmallSet<SDValue, 8> Values;
06852   for (unsigned i = 0; i < NumElems; ++i) {
06853     SDValue Elt = Op.getOperand(i);
06854     if (Elt.getOpcode() == ISD::UNDEF)
06855       continue;
06856     Values.insert(Elt);
06857     if (Elt.getOpcode() != ISD::Constant &&
06858         Elt.getOpcode() != ISD::ConstantFP)
06859       IsAllConstants = false;
06860     if (X86::isZeroNode(Elt))
06861       NumZero++;
06862     else {
06863       NonZeros |= (1 << i);
06864       NumNonZero++;
06865     }
06866   }
06867 
06868   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
06869   if (NumNonZero == 0)
06870     return DAG.getUNDEF(VT);
06871 
06872   // Special case for single non-zero, non-undef, element.
06873   if (NumNonZero == 1) {
06874     unsigned Idx = countTrailingZeros(NonZeros);
06875     SDValue Item = Op.getOperand(Idx);
06876 
06877     // If this is an insertion of an i64 value on x86-32, and if the top bits of
06878     // the value are obviously zero, truncate the value to i32 and do the
06879     // insertion that way.  Only do this if the value is non-constant or if the
06880     // value is a constant being inserted into element 0.  It is cheaper to do
06881     // a constant pool load than it is to do a movd + shuffle.
06882     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
06883         (!IsAllConstants || Idx == 0)) {
06884       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
06885         // Handle SSE only.
06886         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
06887         EVT VecVT = MVT::v4i32;
06888         unsigned VecElts = 4;
06889 
06890         // Truncate the value (which may itself be a constant) to i32, and
06891         // convert it to a vector with movd (S2V+shuffle to zero extend).
06892         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
06893         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
06894 
06895         // If using the new shuffle lowering, just directly insert this.
06896         if (ExperimentalVectorShuffleLowering)
06897           return DAG.getNode(
06898               ISD::BITCAST, dl, VT,
06899               getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
06900 
06901         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06902 
06903         // Now we have our 32-bit value zero extended in the low element of
06904         // a vector.  If Idx != 0, swizzle it into place.
06905         if (Idx != 0) {
06906           SmallVector<int, 4> Mask;
06907           Mask.push_back(Idx);
06908           for (unsigned i = 1; i != VecElts; ++i)
06909             Mask.push_back(i);
06910           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
06911                                       &Mask[0]);
06912         }
06913         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
06914       }
06915     }
06916 
06917     // If we have a constant or non-constant insertion into the low element of
06918     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
06919     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
06920     // depending on what the source datatype is.
06921     if (Idx == 0) {
06922       if (NumZero == 0)
06923         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06924 
06925       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
06926           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
06927         if (VT.is256BitVector() || VT.is512BitVector()) {
06928           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
06929           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
06930                              Item, DAG.getIntPtrConstant(0));
06931         }
06932         assert(VT.is128BitVector() && "Expected an SSE value type!");
06933         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06934         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
06935         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06936       }
06937 
06938       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
06939         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
06940         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
06941         if (VT.is256BitVector()) {
06942           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
06943           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
06944         } else {
06945           assert(VT.is128BitVector() && "Expected an SSE value type!");
06946           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06947         }
06948         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
06949       }
06950     }
06951 
06952     // Is it a vector logical left shift?
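    // For example, (v2i64 (build_vector 0, X)) can be lowered as a logical
    // left shift of (scalar_to_vector X) by half the vector width (64 bits),
    // which moves X into the high element and zeroes the low element.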
06953     if (NumElems == 2 && Idx == 1 &&
06954         X86::isZeroNode(Op.getOperand(0)) &&
06955         !X86::isZeroNode(Op.getOperand(1))) {
06956       unsigned NumBits = VT.getSizeInBits();
06957       return getVShift(true, VT,
06958                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
06959                                    VT, Op.getOperand(1)),
06960                        NumBits/2, DAG, *this, dl);
06961     }
06962 
06963     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
06964       return SDValue();
06965 
06966     // Otherwise, if this is a vector with i32 or f32 elements, and the element
06967     // is a non-constant being inserted into an element other than the low one,
06968     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
06969     // movd/movss) to move this into the low element, then shuffle it into
06970     // place.
06971     if (EVTBits == 32) {
06972       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06973 
06974       // If using the new shuffle lowering, just directly insert this.
06975       if (ExperimentalVectorShuffleLowering)
06976         return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
06977 
06978       // Turn it into a shuffle of zero and zero-extended scalar to vector.
06979       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
06980       SmallVector<int, 8> MaskVec;
06981       for (unsigned i = 0; i != NumElems; ++i)
06982         MaskVec.push_back(i == Idx ? 0 : 1);
06983       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
06984     }
06985   }
06986 
06987   // Splat is obviously ok. Let legalizer expand it to a shuffle.
06988   if (Values.size() == 1) {
06989     if (EVTBits == 32) {
06990       // Instead of a shuffle like this:
06991       // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
06992       // Check if it's possible to issue this instead.
06993       // shuffle (vload ptr), undef, <1, 1, 1, 1>
06994       unsigned Idx = countTrailingZeros(NonZeros);
06995       SDValue Item = Op.getOperand(Idx);
06996       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
06997         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
06998     }
06999     return SDValue();
07000   }
07001 
07002   // A vector full of immediates; various special cases are already
07003   // handled, so this is best done with a single constant-pool load.
07004   if (IsAllConstants)
07005     return SDValue();
07006 
07007   // For AVX-length vectors, see if we can use a vector load to get all of the
07008   // elements, otherwise build the individual 128-bit pieces and use
07009   // shuffles to put them in place.
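  // For example, a v8f32 build_vector that cannot be formed by a single vector
  // load is split into two v4f32 build_vectors, which are then recombined with
  // an insert_subvector-style concat (vinsertf128 on AVX).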
07010   if (VT.is256BitVector() || VT.is512BitVector()) {
07011     SmallVector<SDValue, 64> V;
07012     for (unsigned i = 0; i != NumElems; ++i)
07013       V.push_back(Op.getOperand(i));
07014 
07015     // Check for a build vector of consecutive loads.
07016     if (SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false))
07017       return LD;
07018
07019     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
07020 
07021     // Build both the lower and upper subvector.
07022     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
07023                                 makeArrayRef(&V[0], NumElems/2));
07024     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
07025                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
07026 
07027     // Recreate the wider vector with the lower and upper part.
07028     if (VT.is256BitVector())
07029       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
07030     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
07031   }
07032 
07033   // Let legalizer expand 2-wide build_vectors.
07034   if (EVTBits == 64) {
07035     if (NumNonZero == 1) {
07036       // One half is zero or undef.
07037       unsigned Idx = countTrailingZeros(NonZeros);
07038       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
07039                                  Op.getOperand(Idx));
07040       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
07041     }
07042     return SDValue();
07043   }
07044 
07045   // If element VT is < 32 bits, convert it to inserts into a zero vector.
07046   if (EVTBits == 8 && NumElems == 16) {
07047     SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
07048                                         Subtarget, *this);
07049     if (V.getNode()) return V;
07050   }
07051 
07052   if (EVTBits == 16 && NumElems == 8) {
07053     SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
07054                                       Subtarget, *this);
07055     if (V.getNode()) return V;
07056   }
07057 
07058   // If the element type is 32 bits and there are 4 elements, try to generate an INSERTPS.
07059   if (EVTBits == 32 && NumElems == 4) {
07060     SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
07061     if (V.getNode())
07062       return V;
07063   }
07064 
07065   // If the element type is 32 bits, turn it into a number of shuffles.
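  // For a 4-element vector with at least one zero element, each operand is
  // first materialized as its own vector (a zero vector or a SCALAR_TO_VECTOR),
  // adjacent pairs are then combined with MOVL/unpckl-style shuffles depending
  // on which of the two operands are non-zero, and the two partial results are
  // merged with a final 4-element shuffle.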
07066   SmallVector<SDValue, 8> V(NumElems);
07067   if (NumElems == 4 && NumZero > 0) {
07068     for (unsigned i = 0; i < 4; ++i) {
07069       bool isZero = !(NonZeros & (1 << i));
07070       if (isZero)
07071         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
07072       else
07073         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
07074     }
07075 
07076     for (unsigned i = 0; i < 2; ++i) {
07077       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
07078         default: break;
07079         case 0:
07080           V[i] = V[i*2];  // Must be a zero vector.
07081           break;
07082         case 1:
07083           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
07084           break;
07085         case 2:
07086           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
07087           break;
07088         case 3:
07089           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
07090           break;
07091       }
07092     }
07093 
07094     bool Reverse1 = (NonZeros & 0x3) == 2;
07095     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
07096     int MaskVec[] = {
07097       Reverse1 ? 1 : 0,
07098       Reverse1 ? 0 : 1,
07099       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
07100       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
07101     };
07102     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
07103   }
07104 
07105   if (Values.size() > 1 && VT.is128BitVector()) {
07106     // Check for a build vector of consecutive loads.
07107     for (unsigned i = 0; i < NumElems; ++i)
07108       V[i] = Op.getOperand(i);
07109 
07110     // Check for elements which are consecutive loads.
07111     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
07112     if (LD.getNode())
07113       return LD;
07114 
07115     // Check for a build vector built mostly from a shuffle, plus a few element inserts.
07116     SDValue Sh = buildFromShuffleMostly(Op, DAG);
07117     if (Sh.getNode())
07118       return Sh;
07119 
07120     // For SSE 4.1, use insertps to insert each of the remaining elements into place.
07121     if (getSubtarget()->hasSSE41()) {
07122       SDValue Result;
07123       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
07124         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
07125       else
07126         Result = DAG.