X86ISelLowering.cpp

00001 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
00002 //
00003 //                     The LLVM Compiler Infrastructure
00004 //
00005 // This file is distributed under the University of Illinois Open Source
00006 // License. See LICENSE.TXT for details.
00007 //
00008 //===----------------------------------------------------------------------===//
00009 //
00010 // This file defines the interfaces that X86 uses to lower LLVM code into a
00011 // selection DAG.
00012 //
00013 //===----------------------------------------------------------------------===//
00014 
00015 #include "X86ISelLowering.h"
00016 #include "Utils/X86ShuffleDecode.h"
00017 #include "X86CallingConv.h"
00018 #include "X86InstrBuilder.h"
00019 #include "X86MachineFunctionInfo.h"
00020 #include "X86TargetMachine.h"
00021 #include "X86TargetObjectFile.h"
00022 #include "llvm/ADT/SmallBitVector.h"
00023 #include "llvm/ADT/SmallSet.h"
00024 #include "llvm/ADT/Statistic.h"
00025 #include "llvm/ADT/StringExtras.h"
00026 #include "llvm/ADT/StringSwitch.h"
00027 #include "llvm/ADT/VariadicFunction.h"
00028 #include "llvm/CodeGen/IntrinsicLowering.h"
00029 #include "llvm/CodeGen/MachineFrameInfo.h"
00030 #include "llvm/CodeGen/MachineFunction.h"
00031 #include "llvm/CodeGen/MachineInstrBuilder.h"
00032 #include "llvm/CodeGen/MachineJumpTableInfo.h"
00033 #include "llvm/CodeGen/MachineModuleInfo.h"
00034 #include "llvm/CodeGen/MachineRegisterInfo.h"
00035 #include "llvm/IR/CallSite.h"
00036 #include "llvm/IR/CallingConv.h"
00037 #include "llvm/IR/Constants.h"
00038 #include "llvm/IR/DerivedTypes.h"
00039 #include "llvm/IR/Function.h"
00040 #include "llvm/IR/GlobalAlias.h"
00041 #include "llvm/IR/GlobalVariable.h"
00042 #include "llvm/IR/Instructions.h"
00043 #include "llvm/IR/Intrinsics.h"
00044 #include "llvm/MC/MCAsmInfo.h"
00045 #include "llvm/MC/MCContext.h"
00046 #include "llvm/MC/MCExpr.h"
00047 #include "llvm/MC/MCSymbol.h"
00048 #include "llvm/Support/CommandLine.h"
00049 #include "llvm/Support/Debug.h"
00050 #include "llvm/Support/ErrorHandling.h"
00051 #include "llvm/Support/MathExtras.h"
00052 #include "llvm/Target/TargetOptions.h"
00053 #include "X86IntrinsicsInfo.h"
00054 #include <bitset>
00055 #include <numeric>
00056 #include <cctype>
00057 using namespace llvm;
00058 
00059 #define DEBUG_TYPE "x86-isel"
00060 
00061 STATISTIC(NumTailCalls, "Number of tail calls");
00062 
00063 static cl::opt<bool> ExperimentalVectorWideningLegalization(
00064     "x86-experimental-vector-widening-legalization", cl::init(false),
00065     cl::desc("Enable an experimental vector type legalization through widening "
00066              "rather than promotion."),
00067     cl::Hidden);
00068 
00069 static cl::opt<bool> ExperimentalVectorShuffleLowering(
00070     "x86-experimental-vector-shuffle-lowering", cl::init(true),
00071     cl::desc("Enable an experimental vector shuffle lowering code path."),
00072     cl::Hidden);
00073 
00074 static cl::opt<int> ReciprocalEstimateRefinementSteps(
00075     "x86-recip-refinement-steps", cl::init(1),
00076     cl::desc("Specify the number of Newton-Raphson iterations applied to the "
00077              "result of the hardware reciprocal estimate instruction."),
00078     cl::NotHidden);
00079 
00080 // Forward declarations.
00081 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
00082                        SDValue V2);
00083 
00084 static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
00085                                 SelectionDAG &DAG, SDLoc dl,
00086                                 unsigned vectorWidth) {
00087   assert((vectorWidth == 128 || vectorWidth == 256) &&
00088          "Unsupported vector width");
00089   EVT VT = Vec.getValueType();
00090   EVT ElVT = VT.getVectorElementType();
00091   unsigned Factor = VT.getSizeInBits()/vectorWidth;
00092   EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
00093                                   VT.getVectorNumElements()/Factor);
00094 
00095   // Extract from UNDEF is UNDEF.
00096   if (Vec.getOpcode() == ISD::UNDEF)
00097     return DAG.getUNDEF(ResultVT);
00098 
00099   // Extract the relevant vectorWidth bits.  Generate an EXTRACT_SUBVECTOR
00100   unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
00101 
00102   // This is the index of the first element of the vectorWidth-bit chunk
00103   // we want.
00104   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth)
00105                                * ElemsPerChunk);
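        // Worked example: extracting 128 bits from a v8i32 (ElVT == i32) with
        // IdxVal == 5 gives ElemsPerChunk = 128/32 = 4 and NormalizedIdxVal =
        // ((5*32)/128)*4 = 4, i.e. the index is rounded down to the start of
        // the 128-bit chunk that contains element 5.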
00106 
00107   // If the input is a buildvector just emit a smaller one.
00108   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
00109     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
00110                        makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
00111                                     ElemsPerChunk));
00112 
00113   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00114   SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
00115                                VecIdx);
00116 
00117   return Result;
00118 
00119 }
00120 /// Generate a DAG to grab 128-bits from a vector > 128 bits.  This
00121 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
00122 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
00123 /// instructions or a simple subregister reference. Idx is an index in the
00124 /// 128 bits we want.  It need not be aligned to a 128-bit boundary.  That makes
00125 /// lowering EXTRACT_VECTOR_ELT operations easier.
00126 static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
00127                                    SelectionDAG &DAG, SDLoc dl) {
00128   assert((Vec.getValueType().is256BitVector() ||
00129           Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
00130   return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
00131 }
00132 
00133 /// Generate a DAG to grab 256-bits from a 512-bit vector.
00134 static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
00135                                    SelectionDAG &DAG, SDLoc dl) {
00136   assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
00137   return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
00138 }
00139 
00140 static SDValue InsertSubVector(SDValue Result, SDValue Vec,
00141                                unsigned IdxVal, SelectionDAG &DAG,
00142                                SDLoc dl, unsigned vectorWidth) {
00143   assert((vectorWidth == 128 || vectorWidth == 256) &&
00144          "Unsupported vector width");
00145   // Inserting UNDEF leaves Result unchanged, so just return it.
00146   if (Vec.getOpcode() == ISD::UNDEF)
00147     return Result;
00148   EVT VT = Vec.getValueType();
00149   EVT ElVT = VT.getVectorElementType();
00150   EVT ResultVT = Result.getValueType();
00151 
00152   // Insert the relevant vectorWidth bits.
00153   unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
00154 
00155   // This is the index of the first element of the vectorWidth-bit chunk
00156   // we want.
00157   unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits())/vectorWidth)
00158                                * ElemsPerChunk);
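        // Same normalization as in ExtractSubVector above: e.g. inserting a
        // v4i32 at IdxVal == 6 of a v8i32 result normalizes to element index 4,
        // the start of the upper 128-bit half.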
00159 
00160   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
00161   return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec,
00162                      VecIdx);
00163 }
00164 /// Generate a DAG to put 128-bits into a vector > 128 bits.  This
00165 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
00166 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
00167 /// simple superregister reference.  Idx is an index in the 128 bits
00168 /// we want.  It need not be aligned to a 128-bit boundary.  That makes
00169 /// lowering INSERT_VECTOR_ELT operations easier.
00170 static SDValue Insert128BitVector(SDValue Result, SDValue Vec,
00171                                   unsigned IdxVal, SelectionDAG &DAG,
00172                                   SDLoc dl) {
00173   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
00174   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
00175 }
00176 
00177 static SDValue Insert256BitVector(SDValue Result, SDValue Vec,
00178                                   unsigned IdxVal, SelectionDAG &DAG,
00179                                   SDLoc dl) {
00180   assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
00181   return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
00182 }
00183 
00184 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
00185 /// instructions. This is used because creating CONCAT_VECTOR nodes of
00186 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
00187 /// large BUILD_VECTORS.
00188 static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
00189                                    unsigned NumElems, SelectionDAG &DAG,
00190                                    SDLoc dl) {
00191   SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
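        // E.g. for VT == v8i32 (NumElems == 8), V1 lands in elements [0,3] and
        // V2 below lands in elements [4,7] of the 256-bit result.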
00192   return Insert128BitVector(V, V2, NumElems/2, DAG, dl);
00193 }
00194 
00195 static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
00196                                    unsigned NumElems, SelectionDAG &DAG,
00197                                    SDLoc dl) {
00198   SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
00199   return Insert256BitVector(V, V2, NumElems/2, DAG, dl);
00200 }
00201 
00202 // FIXME: This should stop caching the target machine as soon as
00203 // we can remove resetOperationActions et al.
00204 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM)
00205     : TargetLowering(TM) {
00206   Subtarget = &TM.getSubtarget<X86Subtarget>();
00207   X86ScalarSSEf64 = Subtarget->hasSSE2();
00208   X86ScalarSSEf32 = Subtarget->hasSSE1();
00209   TD = getDataLayout();
00210 
00211   resetOperationActions();
00212 }
00213 
00214 void X86TargetLowering::resetOperationActions() {
00215   const TargetMachine &TM = getTargetMachine();
00216   static bool FirstTimeThrough = true;
00217 
00218   // If none of the target options have changed, then we don't need to reset the
00219   // operation actions.
00220   if (!FirstTimeThrough && TO == TM.Options) return;
00221 
00222   if (!FirstTimeThrough) {
00223     // Reinitialize the actions.
00224     initActions();
00225     FirstTimeThrough = false;
00226   }
00227 
00228   TO = TM.Options;
00229 
00230   // Set up the TargetLowering object.
00231   static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 };
00232 
00233   // X86 is weird: it always uses i8 for shift amounts and setcc results.
00234   setBooleanContents(ZeroOrOneBooleanContent);
00235   // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
00236   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
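        // For example, a scalar SETCC produces 0 or 1 in an i8 register, while a
        // vector compare such as PCMPEQD produces all-ones (-1) or all-zeros
        // lanes, which is the mask form the SSE blend/select instructions key off.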
00237 
00238   // For 64-bit since we have so many registers use the ILP scheduler, for
00239   // 32-bit code use the register pressure specific scheduling.
00240   // For Atom, always use ILP scheduling.
00241   if (Subtarget->isAtom())
00242     setSchedulingPreference(Sched::ILP);
00243   else if (Subtarget->is64Bit())
00244     setSchedulingPreference(Sched::ILP);
00245   else
00246     setSchedulingPreference(Sched::RegPressure);
00247   const X86RegisterInfo *RegInfo =
00248       TM.getSubtarget<X86Subtarget>().getRegisterInfo();
00249   setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
00250 
00251   // Bypass expensive divides on targets with slow division (e.g. Atom) when optimizing.
00252   if (TM.getOptLevel() >= CodeGenOpt::Default) {
00253     if (Subtarget->hasSlowDivide32()) 
00254       addBypassSlowDiv(32, 8);
00255     if (Subtarget->hasSlowDivide64() && Subtarget->is64Bit())
00256       addBypassSlowDiv(64, 16);
00257   }
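        // For example, addBypassSlowDiv(32, 8) lets codegen guard a 32-bit divide
        // with a runtime check and use the much cheaper 8-bit divide when both
        // operands happen to fit in 8 bits (common for small loop counters).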
00258 
00259   if (Subtarget->isTargetKnownWindowsMSVC()) {
00260     // Setup Windows compiler runtime calls.
00261     setLibcallName(RTLIB::SDIV_I64, "_alldiv");
00262     setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
00263     setLibcallName(RTLIB::SREM_I64, "_allrem");
00264     setLibcallName(RTLIB::UREM_I64, "_aullrem");
00265     setLibcallName(RTLIB::MUL_I64, "_allmul");
00266     setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
00267     setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
00268     setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
00269     setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
00270     setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
00271 
00272     // The _ftol2 runtime function has an unusual calling conv, which
00273     // is modeled by a special pseudo-instruction.
00274     setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
00275     setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
00276     setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
00277     setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
00278   }
00279 
00280   if (Subtarget->isTargetDarwin()) {
00281     // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
00282     setUseUnderscoreSetJmp(false);
00283     setUseUnderscoreLongJmp(false);
00284   } else if (Subtarget->isTargetWindowsGNU()) {
00285     // The MS runtime is weird: it exports _setjmp, but plain longjmp (no underscore)!
00286     setUseUnderscoreSetJmp(true);
00287     setUseUnderscoreLongJmp(false);
00288   } else {
00289     setUseUnderscoreSetJmp(true);
00290     setUseUnderscoreLongJmp(true);
00291   }
00292 
00293   // Set up the register classes.
00294   addRegisterClass(MVT::i8, &X86::GR8RegClass);
00295   addRegisterClass(MVT::i16, &X86::GR16RegClass);
00296   addRegisterClass(MVT::i32, &X86::GR32RegClass);
00297   if (Subtarget->is64Bit())
00298     addRegisterClass(MVT::i64, &X86::GR64RegClass);
00299 
00300   setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
00301 
00302   // We don't accept any truncstore of integer registers.
00303   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
00304   setTruncStoreAction(MVT::i64, MVT::i16, Expand);
00305   setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
00306   setTruncStoreAction(MVT::i32, MVT::i16, Expand);
00307   setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
00308   setTruncStoreAction(MVT::i16, MVT::i8,  Expand);
00309 
00310   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
00311 
00312   // SETOEQ and SETUNE require checking two conditions.
00313   setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
00314   setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
00315   setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
00316   setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
00317   setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
00318   setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
00319 
00320   // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
00321   // operation.
00322   setOperationAction(ISD::UINT_TO_FP       , MVT::i1   , Promote);
00323   setOperationAction(ISD::UINT_TO_FP       , MVT::i8   , Promote);
00324   setOperationAction(ISD::UINT_TO_FP       , MVT::i16  , Promote);
00325 
00326   if (Subtarget->is64Bit()) {
00327     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Promote);
00328     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00329   } else if (!TM.Options.UseSoftFloat) {
00330     // We have an algorithm for SSE2->double, and we turn this into a
00331     // 64-bit FILD followed by conditional FADD for other targets.
00332     setOperationAction(ISD::UINT_TO_FP     , MVT::i64  , Custom);
00333     // We have an algorithm for SSE2, and we turn this into a 64-bit
00334     // FILD for other targets.
00335     setOperationAction(ISD::UINT_TO_FP     , MVT::i32  , Custom);
00336   }
00337 
00338   // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
00339   // this operation.
00340   setOperationAction(ISD::SINT_TO_FP       , MVT::i1   , Promote);
00341   setOperationAction(ISD::SINT_TO_FP       , MVT::i8   , Promote);
00342 
00343   if (!TM.Options.UseSoftFloat) {
00344     // SSE has no i16 to fp conversion, only i32
00345     if (X86ScalarSSEf32) {
00346       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00347       // f32 and f64 cases are Legal, f80 case is not
00348       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00349     } else {
00350       setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Custom);
00351       setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Custom);
00352     }
00353   } else {
00354     setOperationAction(ISD::SINT_TO_FP     , MVT::i16  , Promote);
00355     setOperationAction(ISD::SINT_TO_FP     , MVT::i32  , Promote);
00356   }
00357 
00358   // In 32-bit mode these are custom lowered.  In 64-bit mode F32 and F64
00359   // are Legal, f80 is custom lowered.
00360   setOperationAction(ISD::FP_TO_SINT     , MVT::i64  , Custom);
00361   setOperationAction(ISD::SINT_TO_FP     , MVT::i64  , Custom);
00362 
00363   // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTs, as X86 doesn't have
00364   // this operation.
00365   setOperationAction(ISD::FP_TO_SINT       , MVT::i1   , Promote);
00366   setOperationAction(ISD::FP_TO_SINT       , MVT::i8   , Promote);
00367 
00368   if (X86ScalarSSEf32) {
00369     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Promote);
00370     // f32 and f64 cases are Legal, f80 case is not
00371     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00372   } else {
00373     setOperationAction(ISD::FP_TO_SINT     , MVT::i16  , Custom);
00374     setOperationAction(ISD::FP_TO_SINT     , MVT::i32  , Custom);
00375   }
00376 
00377   // Handle FP_TO_UINT by promoting the destination to a larger signed
00378   // conversion.
00379   setOperationAction(ISD::FP_TO_UINT       , MVT::i1   , Promote);
00380   setOperationAction(ISD::FP_TO_UINT       , MVT::i8   , Promote);
00381   setOperationAction(ISD::FP_TO_UINT       , MVT::i16  , Promote);
00382 
00383   if (Subtarget->is64Bit()) {
00384     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Expand);
00385     setOperationAction(ISD::FP_TO_UINT     , MVT::i32  , Promote);
00386   } else if (!TM.Options.UseSoftFloat) {
00387     // Since AVX is a superset of SSE3, only check for SSE here.
00388     if (Subtarget->hasSSE1() && !Subtarget->hasSSE3())
00389       // Expand FP_TO_UINT into a select.
00390       // FIXME: We would like to use a Custom expander here eventually to do
00391       // the optimal thing for SSE vs. the default expansion in the legalizer.
00392       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Expand);
00393     else
00394       // With SSE3 we can use fisttpll to convert to a signed i64; without
00395       // SSE, we're stuck with a fistpll.
00396       setOperationAction(ISD::FP_TO_UINT   , MVT::i32  , Custom);
00397   }
00398 
00399   if (isTargetFTOL()) {
00400     // Use the _ftol2 runtime function, which has a pseudo-instruction
00401     // to handle its weird calling convention.
00402     setOperationAction(ISD::FP_TO_UINT     , MVT::i64  , Custom);
00403   }
00404 
00405   // TODO: when we have SSE, these could be more efficient, by using movd/movq.
00406   if (!X86ScalarSSEf64) {
00407     setOperationAction(ISD::BITCAST        , MVT::f32  , Expand);
00408     setOperationAction(ISD::BITCAST        , MVT::i32  , Expand);
00409     if (Subtarget->is64Bit()) {
00410       setOperationAction(ISD::BITCAST      , MVT::f64  , Expand);
00411       // Without SSE, i64->f64 goes through memory.
00412       setOperationAction(ISD::BITCAST      , MVT::i64  , Expand);
00413     }
00414   }
00415 
00416   // Scalar integer divide and remainder are lowered to use operations that
00417   // produce two results, to match the available instructions. This exposes
00418   // the two-result form to trivial CSE, which is able to combine x/y and x%y
00419   // into a single instruction.
00420   //
00421   // Scalar integer multiply-high is also lowered to use two-result
00422   // operations, to match the available instructions. However, plain multiply
00423   // (low) operations are left as Legal, as there are single-result
00424   // instructions for this in x86. Using the two-result multiply instructions
00425   // when both high and low results are needed must be arranged by dagcombine.
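        // For example, x/y and x%y on i32 CSE into one ISD::SDIVREM node, which
        // matches the single idivl instruction producing the quotient in EAX and
        // the remainder in EDX.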
00426   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00427     MVT VT = IntVTs[i];
00428     setOperationAction(ISD::MULHS, VT, Expand);
00429     setOperationAction(ISD::MULHU, VT, Expand);
00430     setOperationAction(ISD::SDIV, VT, Expand);
00431     setOperationAction(ISD::UDIV, VT, Expand);
00432     setOperationAction(ISD::SREM, VT, Expand);
00433     setOperationAction(ISD::UREM, VT, Expand);
00434 
00435     // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
00436     setOperationAction(ISD::ADDC, VT, Custom);
00437     setOperationAction(ISD::ADDE, VT, Custom);
00438     setOperationAction(ISD::SUBC, VT, Custom);
00439     setOperationAction(ISD::SUBE, VT, Custom);
00440   }
00441 
00442   setOperationAction(ISD::BR_JT            , MVT::Other, Expand);
00443   setOperationAction(ISD::BRCOND           , MVT::Other, Custom);
00444   setOperationAction(ISD::BR_CC            , MVT::f32,   Expand);
00445   setOperationAction(ISD::BR_CC            , MVT::f64,   Expand);
00446   setOperationAction(ISD::BR_CC            , MVT::f80,   Expand);
00447   setOperationAction(ISD::BR_CC            , MVT::i8,    Expand);
00448   setOperationAction(ISD::BR_CC            , MVT::i16,   Expand);
00449   setOperationAction(ISD::BR_CC            , MVT::i32,   Expand);
00450   setOperationAction(ISD::BR_CC            , MVT::i64,   Expand);
00451   setOperationAction(ISD::SELECT_CC        , MVT::f32,   Expand);
00452   setOperationAction(ISD::SELECT_CC        , MVT::f64,   Expand);
00453   setOperationAction(ISD::SELECT_CC        , MVT::f80,   Expand);
00454   setOperationAction(ISD::SELECT_CC        , MVT::i8,    Expand);
00455   setOperationAction(ISD::SELECT_CC        , MVT::i16,   Expand);
00456   setOperationAction(ISD::SELECT_CC        , MVT::i32,   Expand);
00457   setOperationAction(ISD::SELECT_CC        , MVT::i64,   Expand);
00458   if (Subtarget->is64Bit())
00459     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
00460   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16  , Legal);
00461   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8   , Legal);
00462   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1   , Expand);
00463   setOperationAction(ISD::FP_ROUND_INREG   , MVT::f32  , Expand);
00464   setOperationAction(ISD::FREM             , MVT::f32  , Expand);
00465   setOperationAction(ISD::FREM             , MVT::f64  , Expand);
00466   setOperationAction(ISD::FREM             , MVT::f80  , Expand);
00467   setOperationAction(ISD::FLT_ROUNDS_      , MVT::i32  , Custom);
00468 
00469   // Promote the i8 variants and force them up to i32, which has a shorter
00470   // encoding.
00471   setOperationAction(ISD::CTTZ             , MVT::i8   , Promote);
00472   AddPromotedToType (ISD::CTTZ             , MVT::i8   , MVT::i32);
00473   setOperationAction(ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , Promote);
00474   AddPromotedToType (ISD::CTTZ_ZERO_UNDEF  , MVT::i8   , MVT::i32);
00475   if (Subtarget->hasBMI()) {
00476     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16  , Expand);
00477     setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32  , Expand);
00478     if (Subtarget->is64Bit())
00479       setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
00480   } else {
00481     setOperationAction(ISD::CTTZ           , MVT::i16  , Custom);
00482     setOperationAction(ISD::CTTZ           , MVT::i32  , Custom);
00483     if (Subtarget->is64Bit())
00484       setOperationAction(ISD::CTTZ         , MVT::i64  , Custom);
00485   }
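        // The split above mirrors the hardware: with BMI, TZCNT returns the
        // operand size for a zero input, so plain CTTZ needs no extra code;
        // without BMI, BSF leaves its destination undefined for zero, so the
        // Custom lowering must guard that case (e.g. with a CMOV).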
00486 
00487   if (Subtarget->hasLZCNT()) {
00488     // When promoting the i8 variants, force them to i32 for a shorter
00489     // encoding.
00490     setOperationAction(ISD::CTLZ           , MVT::i8   , Promote);
00491     AddPromotedToType (ISD::CTLZ           , MVT::i8   , MVT::i32);
00492     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Promote);
00493     AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8   , MVT::i32);
00494     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Expand);
00495     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Expand);
00496     if (Subtarget->is64Bit())
00497       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
00498   } else {
00499     setOperationAction(ISD::CTLZ           , MVT::i8   , Custom);
00500     setOperationAction(ISD::CTLZ           , MVT::i16  , Custom);
00501     setOperationAction(ISD::CTLZ           , MVT::i32  , Custom);
00502     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8   , Custom);
00503     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16  , Custom);
00504     setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32  , Custom);
00505     if (Subtarget->is64Bit()) {
00506       setOperationAction(ISD::CTLZ         , MVT::i64  , Custom);
00507       setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
00508     }
00509   }
00510 
00511   // Special handling for half-precision floating point conversions.
00512   // If we don't have F16C support, then lower half float conversions
00513   // into library calls.
00514   if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) {
00515     setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
00516     setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
00517   }
00518 
00519   // There's never any support for operations beyond MVT::f32.
00520   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
00521   setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
00522   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
00523   setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
00524 
00525   setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
00526   setTruncStoreAction(MVT::f32, MVT::f16, Expand);
00527   setTruncStoreAction(MVT::f64, MVT::f16, Expand);
00528   setTruncStoreAction(MVT::f80, MVT::f16, Expand);
00529 
00530   if (Subtarget->hasPOPCNT()) {
00531     setOperationAction(ISD::CTPOP          , MVT::i8   , Promote);
00532   } else {
00533     setOperationAction(ISD::CTPOP          , MVT::i8   , Expand);
00534     setOperationAction(ISD::CTPOP          , MVT::i16  , Expand);
00535     setOperationAction(ISD::CTPOP          , MVT::i32  , Expand);
00536     if (Subtarget->is64Bit())
00537       setOperationAction(ISD::CTPOP        , MVT::i64  , Expand);
00538   }
00539 
00540   setOperationAction(ISD::READCYCLECOUNTER , MVT::i64  , Custom);
00541 
00542   if (!Subtarget->hasMOVBE())
00543     setOperationAction(ISD::BSWAP          , MVT::i16  , Expand);
00544 
00545   // These should be promoted to a larger select which is supported.
00546   setOperationAction(ISD::SELECT          , MVT::i1   , Promote);
00547   // X86 wants to expand cmov itself.
00548   setOperationAction(ISD::SELECT          , MVT::i8   , Custom);
00549   setOperationAction(ISD::SELECT          , MVT::i16  , Custom);
00550   setOperationAction(ISD::SELECT          , MVT::i32  , Custom);
00551   setOperationAction(ISD::SELECT          , MVT::f32  , Custom);
00552   setOperationAction(ISD::SELECT          , MVT::f64  , Custom);
00553   setOperationAction(ISD::SELECT          , MVT::f80  , Custom);
00554   setOperationAction(ISD::SETCC           , MVT::i8   , Custom);
00555   setOperationAction(ISD::SETCC           , MVT::i16  , Custom);
00556   setOperationAction(ISD::SETCC           , MVT::i32  , Custom);
00557   setOperationAction(ISD::SETCC           , MVT::f32  , Custom);
00558   setOperationAction(ISD::SETCC           , MVT::f64  , Custom);
00559   setOperationAction(ISD::SETCC           , MVT::f80  , Custom);
00560   if (Subtarget->is64Bit()) {
00561     setOperationAction(ISD::SELECT        , MVT::i64  , Custom);
00562     setOperationAction(ISD::SETCC         , MVT::i64  , Custom);
00563   }
00564   setOperationAction(ISD::EH_RETURN       , MVT::Other, Custom);
00565   // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
00566   // SjLj exception handling but a light-weight setjmp/longjmp replacement to
00567   // support continuations, user-level threading, and so on. As a result, no
00568   // other SjLj exception interfaces are implemented; please don't build
00569   // your own exception handling based on them.
00570   // LLVM/Clang supports zero-cost DWARF exception handling.
00571   setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
00572   setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
00573 
00574   // Darwin ABI issue.
00575   setOperationAction(ISD::ConstantPool    , MVT::i32  , Custom);
00576   setOperationAction(ISD::JumpTable       , MVT::i32  , Custom);
00577   setOperationAction(ISD::GlobalAddress   , MVT::i32  , Custom);
00578   setOperationAction(ISD::GlobalTLSAddress, MVT::i32  , Custom);
00579   if (Subtarget->is64Bit())
00580     setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
00581   setOperationAction(ISD::ExternalSymbol  , MVT::i32  , Custom);
00582   setOperationAction(ISD::BlockAddress    , MVT::i32  , Custom);
00583   if (Subtarget->is64Bit()) {
00584     setOperationAction(ISD::ConstantPool  , MVT::i64  , Custom);
00585     setOperationAction(ISD::JumpTable     , MVT::i64  , Custom);
00586     setOperationAction(ISD::GlobalAddress , MVT::i64  , Custom);
00587     setOperationAction(ISD::ExternalSymbol, MVT::i64  , Custom);
00588     setOperationAction(ISD::BlockAddress  , MVT::i64  , Custom);
00589   }
00590   // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
00591   setOperationAction(ISD::SHL_PARTS       , MVT::i32  , Custom);
00592   setOperationAction(ISD::SRA_PARTS       , MVT::i32  , Custom);
00593   setOperationAction(ISD::SRL_PARTS       , MVT::i32  , Custom);
00594   if (Subtarget->is64Bit()) {
00595     setOperationAction(ISD::SHL_PARTS     , MVT::i64  , Custom);
00596     setOperationAction(ISD::SRA_PARTS     , MVT::i64  , Custom);
00597     setOperationAction(ISD::SRL_PARTS     , MVT::i64  , Custom);
00598   }
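        // E.g. a variable 64-bit shift on 32-bit x86 becomes an SHL_PARTS node,
        // which is typically matched to the SHLD/SHRD double-shift instructions
        // plus a small fix-up for shift amounts of 32 or more.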
00599 
00600   if (Subtarget->hasSSE1())
00601     setOperationAction(ISD::PREFETCH      , MVT::Other, Legal);
00602 
00603   setOperationAction(ISD::ATOMIC_FENCE  , MVT::Other, Custom);
00604 
00605   // Expand certain atomics
00606   for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) {
00607     MVT VT = IntVTs[i];
00608     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
00609     setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
00610     setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
00611   }
00612 
00613   if (Subtarget->hasCmpxchg16b()) {
00614     setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
00615   }
00616 
00617   // FIXME - use subtarget debug flags
00618   if (!Subtarget->isTargetDarwin() && !Subtarget->isTargetELF() &&
00619       !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) {
00620     setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
00621   }
00622 
00623   if (Subtarget->is64Bit()) {
00624     setExceptionPointerRegister(X86::RAX);
00625     setExceptionSelectorRegister(X86::RDX);
00626   } else {
00627     setExceptionPointerRegister(X86::EAX);
00628     setExceptionSelectorRegister(X86::EDX);
00629   }
00630   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
00631   setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
00632 
00633   setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
00634   setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
00635 
00636   setOperationAction(ISD::TRAP, MVT::Other, Legal);
00637   setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
00638 
00639   // VASTART needs to be custom lowered to use the VarArgsFrameIndex
00640   setOperationAction(ISD::VASTART           , MVT::Other, Custom);
00641   setOperationAction(ISD::VAEND             , MVT::Other, Expand);
00642   if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) {
00643     // TargetInfo::X86_64ABIBuiltinVaList
00644     setOperationAction(ISD::VAARG           , MVT::Other, Custom);
00645     setOperationAction(ISD::VACOPY          , MVT::Other, Custom);
00646   } else {
00647     // TargetInfo::CharPtrBuiltinVaList
00648     setOperationAction(ISD::VAARG           , MVT::Other, Expand);
00649     setOperationAction(ISD::VACOPY          , MVT::Other, Expand);
00650   }
00651 
00652   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
00653   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
00654 
00655   setOperationAction(ISD::DYNAMIC_STACKALLOC, getPointerTy(), Custom);
00656 
00657   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
00658     // f32 and f64 use SSE.
00659     // Set up the FP register classes.
00660     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00661     addRegisterClass(MVT::f64, &X86::FR64RegClass);
00662 
00663     // Use ANDPD to simulate FABS.
00664     setOperationAction(ISD::FABS , MVT::f64, Custom);
00665     setOperationAction(ISD::FABS , MVT::f32, Custom);
00666 
00667     // Use XORP to simulate FNEG.
00668     setOperationAction(ISD::FNEG , MVT::f64, Custom);
00669     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00670 
00671     // Use ANDPD and ORPD to simulate FCOPYSIGN.
00672     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
00673     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00674 
00675     // Lower this to FGETSIGNx86 plus an AND.
00676     setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
00677     setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
00678 
00679     // We don't support sin/cos/fmod
00680     setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00681     setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00682     setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00683     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00684     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00685     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00686 
00687     // Expand FP immediates into loads from the stack, except for the special
00688     // cases we handle.
00689     addLegalFPImmediate(APFloat(+0.0)); // xorpd
00690     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00691   } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) {
00692     // Use SSE for f32, x87 for f64.
00693     // Set up the FP register classes.
00694     addRegisterClass(MVT::f32, &X86::FR32RegClass);
00695     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00696 
00697     // Use ANDPS to simulate FABS.
00698     setOperationAction(ISD::FABS , MVT::f32, Custom);
00699 
00700     // Use XORP to simulate FNEG.
00701     setOperationAction(ISD::FNEG , MVT::f32, Custom);
00702 
00703     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00704 
00705     // Use ANDPS and ORPS to simulate FCOPYSIGN.
00706     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00707     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
00708 
00709     // We don't support sin/cos/fmod
00710     setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00711     setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00712     setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00713 
00714     // Special cases we handle for FP constants.
00715     addLegalFPImmediate(APFloat(+0.0f)); // xorps
00716     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00717     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00718     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00719     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00720 
00721     if (!TM.Options.UnsafeFPMath) {
00722       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00723       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00724       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00725     }
00726   } else if (!TM.Options.UseSoftFloat) {
00727     // f32 and f64 in x87.
00728     // Set up the FP register classes.
00729     addRegisterClass(MVT::f64, &X86::RFP64RegClass);
00730     addRegisterClass(MVT::f32, &X86::RFP32RegClass);
00731 
00732     setOperationAction(ISD::UNDEF,     MVT::f64, Expand);
00733     setOperationAction(ISD::UNDEF,     MVT::f32, Expand);
00734     setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
00735     setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
00736 
00737     if (!TM.Options.UnsafeFPMath) {
00738       setOperationAction(ISD::FSIN   , MVT::f64, Expand);
00739       setOperationAction(ISD::FSIN   , MVT::f32, Expand);
00740       setOperationAction(ISD::FCOS   , MVT::f64, Expand);
00741       setOperationAction(ISD::FCOS   , MVT::f32, Expand);
00742       setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
00743       setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
00744     }
00745     addLegalFPImmediate(APFloat(+0.0)); // FLD0
00746     addLegalFPImmediate(APFloat(+1.0)); // FLD1
00747     addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
00748     addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
00749     addLegalFPImmediate(APFloat(+0.0f)); // FLD0
00750     addLegalFPImmediate(APFloat(+1.0f)); // FLD1
00751     addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
00752     addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
00753   }
00754 
00755   // We don't support FMA.
00756   setOperationAction(ISD::FMA, MVT::f64, Expand);
00757   setOperationAction(ISD::FMA, MVT::f32, Expand);
00758 
00759   // Long double always uses X87.
00760   if (!TM.Options.UseSoftFloat) {
00761     addRegisterClass(MVT::f80, &X86::RFP80RegClass);
00762     setOperationAction(ISD::UNDEF,     MVT::f80, Expand);
00763     setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
00764     {
00765       APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended);
00766       addLegalFPImmediate(TmpFlt);  // FLD0
00767       TmpFlt.changeSign();
00768       addLegalFPImmediate(TmpFlt);  // FLD0/FCHS
00769 
00770       bool ignored;
00771       APFloat TmpFlt2(+1.0);
00772       TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven,
00773                       &ignored);
00774       addLegalFPImmediate(TmpFlt2);  // FLD1
00775       TmpFlt2.changeSign();
00776       addLegalFPImmediate(TmpFlt2);  // FLD1/FCHS
00777     }
00778 
00779     if (!TM.Options.UnsafeFPMath) {
00780       setOperationAction(ISD::FSIN   , MVT::f80, Expand);
00781       setOperationAction(ISD::FCOS   , MVT::f80, Expand);
00782       setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
00783     }
00784 
00785     setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
00786     setOperationAction(ISD::FCEIL,  MVT::f80, Expand);
00787     setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
00788     setOperationAction(ISD::FRINT,  MVT::f80, Expand);
00789     setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
00790     setOperationAction(ISD::FMA, MVT::f80, Expand);
00791   }
00792 
00793   // Always use a library call for pow.
00794   setOperationAction(ISD::FPOW             , MVT::f32  , Expand);
00795   setOperationAction(ISD::FPOW             , MVT::f64  , Expand);
00796   setOperationAction(ISD::FPOW             , MVT::f80  , Expand);
00797 
00798   setOperationAction(ISD::FLOG, MVT::f80, Expand);
00799   setOperationAction(ISD::FLOG2, MVT::f80, Expand);
00800   setOperationAction(ISD::FLOG10, MVT::f80, Expand);
00801   setOperationAction(ISD::FEXP, MVT::f80, Expand);
00802   setOperationAction(ISD::FEXP2, MVT::f80, Expand);
00803   setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
00804   setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
00805 
00806   // First set operation action for all vector types to either promote
00807   // (for widening) or expand (for scalarization). Then we will selectively
00808   // turn on ones that can be effectively codegen'd.
00809   for (int i = MVT::FIRST_VECTOR_VALUETYPE;
00810            i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
00811     MVT VT = (MVT::SimpleValueType)i;
00812     setOperationAction(ISD::ADD , VT, Expand);
00813     setOperationAction(ISD::SUB , VT, Expand);
00814     setOperationAction(ISD::FADD, VT, Expand);
00815     setOperationAction(ISD::FNEG, VT, Expand);
00816     setOperationAction(ISD::FSUB, VT, Expand);
00817     setOperationAction(ISD::MUL , VT, Expand);
00818     setOperationAction(ISD::FMUL, VT, Expand);
00819     setOperationAction(ISD::SDIV, VT, Expand);
00820     setOperationAction(ISD::UDIV, VT, Expand);
00821     setOperationAction(ISD::FDIV, VT, Expand);
00822     setOperationAction(ISD::SREM, VT, Expand);
00823     setOperationAction(ISD::UREM, VT, Expand);
00824     setOperationAction(ISD::LOAD, VT, Expand);
00825     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
00826     setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
00827     setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
00828     setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
00829     setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
00830     setOperationAction(ISD::FABS, VT, Expand);
00831     setOperationAction(ISD::FSIN, VT, Expand);
00832     setOperationAction(ISD::FSINCOS, VT, Expand);
00833     setOperationAction(ISD::FCOS, VT, Expand);
00834     setOperationAction(ISD::FSINCOS, VT, Expand);
00835     setOperationAction(ISD::FREM, VT, Expand);
00836     setOperationAction(ISD::FMA,  VT, Expand);
00837     setOperationAction(ISD::FPOWI, VT, Expand);
00838     setOperationAction(ISD::FSQRT, VT, Expand);
00839     setOperationAction(ISD::FCOPYSIGN, VT, Expand);
00840     setOperationAction(ISD::FFLOOR, VT, Expand);
00841     setOperationAction(ISD::FCEIL, VT, Expand);
00842     setOperationAction(ISD::FTRUNC, VT, Expand);
00843     setOperationAction(ISD::FRINT, VT, Expand);
00844     setOperationAction(ISD::FNEARBYINT, VT, Expand);
00845     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
00846     setOperationAction(ISD::MULHS, VT, Expand);
00847     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
00848     setOperationAction(ISD::MULHU, VT, Expand);
00849     setOperationAction(ISD::SDIVREM, VT, Expand);
00850     setOperationAction(ISD::UDIVREM, VT, Expand);
00851     setOperationAction(ISD::FPOW, VT, Expand);
00852     setOperationAction(ISD::CTPOP, VT, Expand);
00853     setOperationAction(ISD::CTTZ, VT, Expand);
00854     setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
00855     setOperationAction(ISD::CTLZ, VT, Expand);
00856     setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
00857     setOperationAction(ISD::SHL, VT, Expand);
00858     setOperationAction(ISD::SRA, VT, Expand);
00859     setOperationAction(ISD::SRL, VT, Expand);
00860     setOperationAction(ISD::ROTL, VT, Expand);
00861     setOperationAction(ISD::ROTR, VT, Expand);
00862     setOperationAction(ISD::BSWAP, VT, Expand);
00863     setOperationAction(ISD::SETCC, VT, Expand);
00864     setOperationAction(ISD::FLOG, VT, Expand);
00865     setOperationAction(ISD::FLOG2, VT, Expand);
00866     setOperationAction(ISD::FLOG10, VT, Expand);
00867     setOperationAction(ISD::FEXP, VT, Expand);
00868     setOperationAction(ISD::FEXP2, VT, Expand);
00869     setOperationAction(ISD::FP_TO_UINT, VT, Expand);
00870     setOperationAction(ISD::FP_TO_SINT, VT, Expand);
00871     setOperationAction(ISD::UINT_TO_FP, VT, Expand);
00872     setOperationAction(ISD::SINT_TO_FP, VT, Expand);
00873     setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
00874     setOperationAction(ISD::TRUNCATE, VT, Expand);
00875     setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
00876     setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
00877     setOperationAction(ISD::ANY_EXTEND, VT, Expand);
00878     setOperationAction(ISD::VSELECT, VT, Expand);
00879     setOperationAction(ISD::SELECT_CC, VT, Expand);
00880     for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE;
00881              InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
00882       setTruncStoreAction(VT,
00883                           (MVT::SimpleValueType)InnerVT, Expand);
00884     setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
00885     setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
00886 
00887     // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like types;
00888     // we have to deal with them whether we ask for Expansion or not. Setting
00889     // Expand causes its own optimisation problems though, so leave them legal.
00890     if (VT.getVectorElementType() == MVT::i1)
00891       setLoadExtAction(ISD::EXTLOAD, VT, Expand);
00892   }
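        // Everything below selectively overrides this baseline; e.g. once the
        // SSE1 register class for v4f32 is added further down, FADD on v4f32 is
        // flipped back to Legal.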
00893 
00894   // FIXME: In order to prevent SSE instructions being expanded to MMX ones
00895   // with -msoft-float, disable use of MMX as well.
00896   if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) {
00897     addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
00898     // No operations on x86mmx supported, everything uses intrinsics.
00899   }
00900 
00901   // MMX-sized vectors (other than x86mmx) are expected to be expanded
00902   // into smaller operations.
00903   setOperationAction(ISD::MULHS,              MVT::v8i8,  Expand);
00904   setOperationAction(ISD::MULHS,              MVT::v4i16, Expand);
00905   setOperationAction(ISD::MULHS,              MVT::v2i32, Expand);
00906   setOperationAction(ISD::MULHS,              MVT::v1i64, Expand);
00907   setOperationAction(ISD::AND,                MVT::v8i8,  Expand);
00908   setOperationAction(ISD::AND,                MVT::v4i16, Expand);
00909   setOperationAction(ISD::AND,                MVT::v2i32, Expand);
00910   setOperationAction(ISD::AND,                MVT::v1i64, Expand);
00911   setOperationAction(ISD::OR,                 MVT::v8i8,  Expand);
00912   setOperationAction(ISD::OR,                 MVT::v4i16, Expand);
00913   setOperationAction(ISD::OR,                 MVT::v2i32, Expand);
00914   setOperationAction(ISD::OR,                 MVT::v1i64, Expand);
00915   setOperationAction(ISD::XOR,                MVT::v8i8,  Expand);
00916   setOperationAction(ISD::XOR,                MVT::v4i16, Expand);
00917   setOperationAction(ISD::XOR,                MVT::v2i32, Expand);
00918   setOperationAction(ISD::XOR,                MVT::v1i64, Expand);
00919   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i8,  Expand);
00920   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v4i16, Expand);
00921   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v2i32, Expand);
00922   setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v1i64, Expand);
00923   setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v1i64, Expand);
00924   setOperationAction(ISD::SELECT,             MVT::v8i8,  Expand);
00925   setOperationAction(ISD::SELECT,             MVT::v4i16, Expand);
00926   setOperationAction(ISD::SELECT,             MVT::v2i32, Expand);
00927   setOperationAction(ISD::SELECT,             MVT::v1i64, Expand);
00928   setOperationAction(ISD::BITCAST,            MVT::v8i8,  Expand);
00929   setOperationAction(ISD::BITCAST,            MVT::v4i16, Expand);
00930   setOperationAction(ISD::BITCAST,            MVT::v2i32, Expand);
00931   setOperationAction(ISD::BITCAST,            MVT::v1i64, Expand);
00932 
00933   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) {
00934     addRegisterClass(MVT::v4f32, &X86::VR128RegClass);
00935 
00936     setOperationAction(ISD::FADD,               MVT::v4f32, Legal);
00937     setOperationAction(ISD::FSUB,               MVT::v4f32, Legal);
00938     setOperationAction(ISD::FMUL,               MVT::v4f32, Legal);
00939     setOperationAction(ISD::FDIV,               MVT::v4f32, Legal);
00940     setOperationAction(ISD::FSQRT,              MVT::v4f32, Legal);
00941     setOperationAction(ISD::FNEG,               MVT::v4f32, Custom);
00942     setOperationAction(ISD::FABS,               MVT::v4f32, Custom);
00943     setOperationAction(ISD::LOAD,               MVT::v4f32, Legal);
00944     setOperationAction(ISD::BUILD_VECTOR,       MVT::v4f32, Custom);
00945     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v4f32, Custom);
00946     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
00947     setOperationAction(ISD::SELECT,             MVT::v4f32, Custom);
00948     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Custom);
00949   }
00950 
00951   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) {
00952     addRegisterClass(MVT::v2f64, &X86::VR128RegClass);
00953 
00954     // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
00955     // registers cannot be used even for integer operations.
00956     addRegisterClass(MVT::v16i8, &X86::VR128RegClass);
00957     addRegisterClass(MVT::v8i16, &X86::VR128RegClass);
00958     addRegisterClass(MVT::v4i32, &X86::VR128RegClass);
00959     addRegisterClass(MVT::v2i64, &X86::VR128RegClass);
00960 
00961     setOperationAction(ISD::ADD,                MVT::v16i8, Legal);
00962     setOperationAction(ISD::ADD,                MVT::v8i16, Legal);
00963     setOperationAction(ISD::ADD,                MVT::v4i32, Legal);
00964     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
00965     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
00966     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
00967     setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
00968     setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
00969     setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
00970     setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
00971     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
00972     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
00973     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
00974     setOperationAction(ISD::SUB,                MVT::v2i64, Legal);
00975     setOperationAction(ISD::MUL,                MVT::v8i16, Legal);
00976     setOperationAction(ISD::FADD,               MVT::v2f64, Legal);
00977     setOperationAction(ISD::FSUB,               MVT::v2f64, Legal);
00978     setOperationAction(ISD::FMUL,               MVT::v2f64, Legal);
00979     setOperationAction(ISD::FDIV,               MVT::v2f64, Legal);
00980     setOperationAction(ISD::FSQRT,              MVT::v2f64, Legal);
00981     setOperationAction(ISD::FNEG,               MVT::v2f64, Custom);
00982     setOperationAction(ISD::FABS,               MVT::v2f64, Custom);
00983 
00984     setOperationAction(ISD::SETCC,              MVT::v2i64, Custom);
00985     setOperationAction(ISD::SETCC,              MVT::v16i8, Custom);
00986     setOperationAction(ISD::SETCC,              MVT::v8i16, Custom);
00987     setOperationAction(ISD::SETCC,              MVT::v4i32, Custom);
00988 
00989     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v16i8, Custom);
00990     setOperationAction(ISD::SCALAR_TO_VECTOR,   MVT::v8i16, Custom);
00991     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
00992     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
00993     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
00994 
00995     // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
00996     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
00997       MVT VT = (MVT::SimpleValueType)i;
00998       // Do not attempt to custom lower non-power-of-2 vectors
00999       if (!isPowerOf2_32(VT.getVectorNumElements()))
01000         continue;
01001       // Do not attempt to custom lower non-128-bit vectors
01002       if (!VT.is128BitVector())
01003         continue;
01004       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01005       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01006       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01007     }
01008 
01009     // We support custom legalizing of sext and anyext loads for specific
01010     // memory vector types which we can load as a scalar (or sequence of
01011     // scalars) and extend in-register to a legal 128-bit vector type. For sext
01012     // loads these must work with a single scalar load.
01013     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i8, Custom);
01014     setLoadExtAction(ISD::SEXTLOAD, MVT::v4i16, Custom);
01015     setLoadExtAction(ISD::SEXTLOAD, MVT::v8i8, Custom);
01016     setLoadExtAction(ISD::EXTLOAD, MVT::v2i8, Custom);
01017     setLoadExtAction(ISD::EXTLOAD, MVT::v2i16, Custom);
01018     setLoadExtAction(ISD::EXTLOAD, MVT::v2i32, Custom);
01019     setLoadExtAction(ISD::EXTLOAD, MVT::v4i8, Custom);
01020     setLoadExtAction(ISD::EXTLOAD, MVT::v4i16, Custom);
01021     setLoadExtAction(ISD::EXTLOAD, MVT::v8i8, Custom);
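          // For instance, a sign-extending load of v4i8 can be done with one
          // 32-bit scalar load and then widened in-register (e.g. PMOVSXBD on
          // SSE4.1, or unpacks plus arithmetic shifts on plain SSE2) rather
          // than four separate byte loads.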
01022 
01023     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2f64, Custom);
01024     setOperationAction(ISD::BUILD_VECTOR,       MVT::v2i64, Custom);
01025     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2f64, Custom);
01026     setOperationAction(ISD::VECTOR_SHUFFLE,     MVT::v2i64, Custom);
01027     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2f64, Custom);
01028     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom);
01029 
01030     if (Subtarget->is64Bit()) {
01031       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01032       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01033     }
01034 
01035     // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
01036     for (int i = MVT::v16i8; i != MVT::v2i64; ++i) {
01037       MVT VT = (MVT::SimpleValueType)i;
01038 
01039       // Do not attempt to promote non-128-bit vectors
01040       if (!VT.is128BitVector())
01041         continue;
01042 
01043       setOperationAction(ISD::AND,    VT, Promote);
01044       AddPromotedToType (ISD::AND,    VT, MVT::v2i64);
01045       setOperationAction(ISD::OR,     VT, Promote);
01046       AddPromotedToType (ISD::OR,     VT, MVT::v2i64);
01047       setOperationAction(ISD::XOR,    VT, Promote);
01048       AddPromotedToType (ISD::XOR,    VT, MVT::v2i64);
01049       setOperationAction(ISD::LOAD,   VT, Promote);
01050       AddPromotedToType (ISD::LOAD,   VT, MVT::v2i64);
01051       setOperationAction(ISD::SELECT, VT, Promote);
01052       AddPromotedToType (ISD::SELECT, VT, MVT::v2i64);
01053     }
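          // E.g. an AND of two v4i32 values is bitcast to v2i64, performed as the
          // single legal v2i64 AND (a plain PAND), and bitcast back; the bitwise
          // result is identical whatever the element width.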
01054 
01055     // Custom lower v2i64 and v2f64 selects.
01056     setOperationAction(ISD::LOAD,               MVT::v2f64, Legal);
01057     setOperationAction(ISD::LOAD,               MVT::v2i64, Legal);
01058     setOperationAction(ISD::SELECT,             MVT::v2f64, Custom);
01059     setOperationAction(ISD::SELECT,             MVT::v2i64, Custom);
01060 
01061     setOperationAction(ISD::FP_TO_SINT,         MVT::v4i32, Legal);
01062     setOperationAction(ISD::SINT_TO_FP,         MVT::v4i32, Legal);
01063 
01064     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i8,  Custom);
01065     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i16, Custom);
01066     // As there is no 64-bit GPR available, we need to build a special custom
01067     // sequence to convert from v2i32 to v2f32.
01068     if (!Subtarget->is64Bit())
01069       setOperationAction(ISD::UINT_TO_FP,       MVT::v2f32, Custom);
01070 
01071     setOperationAction(ISD::FP_EXTEND,          MVT::v2f32, Custom);
01072     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
01073 
01074     setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
01075 
01076     setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
01077     setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
01078     setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
01079   }
01080 
01081   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
01082     setOperationAction(ISD::FFLOOR,             MVT::f32,   Legal);
01083     setOperationAction(ISD::FCEIL,              MVT::f32,   Legal);
01084     setOperationAction(ISD::FTRUNC,             MVT::f32,   Legal);
01085     setOperationAction(ISD::FRINT,              MVT::f32,   Legal);
01086     setOperationAction(ISD::FNEARBYINT,         MVT::f32,   Legal);
01087     setOperationAction(ISD::FFLOOR,             MVT::f64,   Legal);
01088     setOperationAction(ISD::FCEIL,              MVT::f64,   Legal);
01089     setOperationAction(ISD::FTRUNC,             MVT::f64,   Legal);
01090     setOperationAction(ISD::FRINT,              MVT::f64,   Legal);
01091     setOperationAction(ISD::FNEARBYINT,         MVT::f64,   Legal);
01092 
01093     setOperationAction(ISD::FFLOOR,             MVT::v4f32, Legal);
01094     setOperationAction(ISD::FCEIL,              MVT::v4f32, Legal);
01095     setOperationAction(ISD::FTRUNC,             MVT::v4f32, Legal);
01096     setOperationAction(ISD::FRINT,              MVT::v4f32, Legal);
01097     setOperationAction(ISD::FNEARBYINT,         MVT::v4f32, Legal);
01098     setOperationAction(ISD::FFLOOR,             MVT::v2f64, Legal);
01099     setOperationAction(ISD::FCEIL,              MVT::v2f64, Legal);
01100     setOperationAction(ISD::FTRUNC,             MVT::v2f64, Legal);
01101     setOperationAction(ISD::FRINT,              MVT::v2f64, Legal);
01102     setOperationAction(ISD::FNEARBYINT,         MVT::v2f64, Legal);
01103 
01104     // FIXME: Do we need to handle scalar-to-vector here?
01105     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
01106 
01107     setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
01108     setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
01109     setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
01110     setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
01111     setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
01112     // There is no BLENDI for byte vectors; we don't need to custom lower
01113     // these vselects for now.
01114     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
01115 
01116     // SSE41 brings specific instructions for doing vector sign extend even in
01117     // cases where we don't have SRA.
01118     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i8, Custom);
01119     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i16, Custom);
01120     setLoadExtAction(ISD::SEXTLOAD, MVT::v2i32, Custom);
01121 
01122     // i8 and i16 vectors are custom because the source register and source
01123     // memory operand types are not the same width.  f32 vectors are
01124     // custom since the immediate controlling the insert encodes additional
01125     // information.
01126     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i8, Custom);
01127     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i16, Custom);
01128     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4i32, Custom);
01129     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v4f32, Custom);
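    // As a rough example of why: PINSRB/PINSRW read their scalar operand from
    // a 32-bit GPR (only the low 8/16 bits are used) while their memory forms
    // load just 8/16 bits, and INSERTPS takes an immediate that selects the
    // source element, the destination lane, and a zero mask, so the custom
    // lowering has to build that immediate itself.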
01130 
01131     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom);
01132     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom);
01133     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
01134     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
01135 
01136     // FIXME: these should be Legal, but that's only for the case where
01137     // the index is constant.  For now custom expand to deal with that.
01138     if (Subtarget->is64Bit()) {
01139       setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v2i64, Custom);
01140       setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom);
01141     }
01142   }
01143 
01144   if (Subtarget->hasSSE2()) {
01145     setOperationAction(ISD::SRL,               MVT::v8i16, Custom);
01146     setOperationAction(ISD::SRL,               MVT::v16i8, Custom);
01147 
01148     setOperationAction(ISD::SHL,               MVT::v8i16, Custom);
01149     setOperationAction(ISD::SHL,               MVT::v16i8, Custom);
01150 
01151     setOperationAction(ISD::SRA,               MVT::v8i16, Custom);
01152     setOperationAction(ISD::SRA,               MVT::v16i8, Custom);
01153 
01154     // In the customized shift lowering, the legal cases in AVX2 will be
01155     // recognized.
01156     setOperationAction(ISD::SRL,               MVT::v2i64, Custom);
01157     setOperationAction(ISD::SRL,               MVT::v4i32, Custom);
01158 
01159     setOperationAction(ISD::SHL,               MVT::v2i64, Custom);
01160     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
01161 
01162     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
01163   }
01164 
01165   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
01166     addRegisterClass(MVT::v32i8,  &X86::VR256RegClass);
01167     addRegisterClass(MVT::v16i16, &X86::VR256RegClass);
01168     addRegisterClass(MVT::v8i32,  &X86::VR256RegClass);
01169     addRegisterClass(MVT::v8f32,  &X86::VR256RegClass);
01170     addRegisterClass(MVT::v4i64,  &X86::VR256RegClass);
01171     addRegisterClass(MVT::v4f64,  &X86::VR256RegClass);
01172 
01173     setOperationAction(ISD::LOAD,               MVT::v8f32, Legal);
01174     setOperationAction(ISD::LOAD,               MVT::v4f64, Legal);
01175     setOperationAction(ISD::LOAD,               MVT::v4i64, Legal);
01176 
01177     setOperationAction(ISD::FADD,               MVT::v8f32, Legal);
01178     setOperationAction(ISD::FSUB,               MVT::v8f32, Legal);
01179     setOperationAction(ISD::FMUL,               MVT::v8f32, Legal);
01180     setOperationAction(ISD::FDIV,               MVT::v8f32, Legal);
01181     setOperationAction(ISD::FSQRT,              MVT::v8f32, Legal);
01182     setOperationAction(ISD::FFLOOR,             MVT::v8f32, Legal);
01183     setOperationAction(ISD::FCEIL,              MVT::v8f32, Legal);
01184     setOperationAction(ISD::FTRUNC,             MVT::v8f32, Legal);
01185     setOperationAction(ISD::FRINT,              MVT::v8f32, Legal);
01186     setOperationAction(ISD::FNEARBYINT,         MVT::v8f32, Legal);
01187     setOperationAction(ISD::FNEG,               MVT::v8f32, Custom);
01188     setOperationAction(ISD::FABS,               MVT::v8f32, Custom);
01189 
01190     setOperationAction(ISD::FADD,               MVT::v4f64, Legal);
01191     setOperationAction(ISD::FSUB,               MVT::v4f64, Legal);
01192     setOperationAction(ISD::FMUL,               MVT::v4f64, Legal);
01193     setOperationAction(ISD::FDIV,               MVT::v4f64, Legal);
01194     setOperationAction(ISD::FSQRT,              MVT::v4f64, Legal);
01195     setOperationAction(ISD::FFLOOR,             MVT::v4f64, Legal);
01196     setOperationAction(ISD::FCEIL,              MVT::v4f64, Legal);
01197     setOperationAction(ISD::FTRUNC,             MVT::v4f64, Legal);
01198     setOperationAction(ISD::FRINT,              MVT::v4f64, Legal);
01199     setOperationAction(ISD::FNEARBYINT,         MVT::v4f64, Legal);
01200     setOperationAction(ISD::FNEG,               MVT::v4f64, Custom);
01201     setOperationAction(ISD::FABS,               MVT::v4f64, Custom);
01202 
01203     // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
01204     // even though v8i16 is a legal type.
01205     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i16, Promote);
01206     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i16, Promote);
01207     setOperationAction(ISD::FP_TO_SINT,         MVT::v8i32, Legal);
01208 
01209     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i16, Promote);
01210     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i32, Legal);
01211     setOperationAction(ISD::FP_ROUND,           MVT::v4f32, Legal);
01212 
01213     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i8,  Custom);
01214     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i16, Custom);
01215 
01216     setLoadExtAction(ISD::EXTLOAD,              MVT::v4f32, Legal);
01217 
01218     setOperationAction(ISD::SRL,               MVT::v16i16, Custom);
01219     setOperationAction(ISD::SRL,               MVT::v32i8, Custom);
01220 
01221     setOperationAction(ISD::SHL,               MVT::v16i16, Custom);
01222     setOperationAction(ISD::SHL,               MVT::v32i8, Custom);
01223 
01224     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
01225     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
01226 
01227     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
01228     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
01229     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
01230     setOperationAction(ISD::SETCC,             MVT::v4i64, Custom);
01231 
01232     setOperationAction(ISD::SELECT,            MVT::v4f64, Custom);
01233     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
01234     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
01235 
01236     setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
01237     setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
01238     setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
01239     setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
01240 
01241     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
01242     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
01243     setOperationAction(ISD::SIGN_EXTEND,       MVT::v16i16, Custom);
01244     setOperationAction(ISD::ZERO_EXTEND,       MVT::v4i64, Custom);
01245     setOperationAction(ISD::ZERO_EXTEND,       MVT::v8i32, Custom);
01246     setOperationAction(ISD::ZERO_EXTEND,       MVT::v16i16, Custom);
01247     setOperationAction(ISD::ANY_EXTEND,        MVT::v4i64, Custom);
01248     setOperationAction(ISD::ANY_EXTEND,        MVT::v8i32, Custom);
01249     setOperationAction(ISD::ANY_EXTEND,        MVT::v16i16, Custom);
01250     setOperationAction(ISD::TRUNCATE,          MVT::v16i8, Custom);
01251     setOperationAction(ISD::TRUNCATE,          MVT::v8i16, Custom);
01252     setOperationAction(ISD::TRUNCATE,          MVT::v4i32, Custom);
01253 
01254     if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
01255       setOperationAction(ISD::FMA,             MVT::v8f32, Legal);
01256       setOperationAction(ISD::FMA,             MVT::v4f64, Legal);
01257       setOperationAction(ISD::FMA,             MVT::v4f32, Legal);
01258       setOperationAction(ISD::FMA,             MVT::v2f64, Legal);
01259       setOperationAction(ISD::FMA,             MVT::f32, Legal);
01260       setOperationAction(ISD::FMA,             MVT::f64, Legal);
01261     }
01262 
01263     if (Subtarget->hasInt256()) {
01264       setOperationAction(ISD::ADD,             MVT::v4i64, Legal);
01265       setOperationAction(ISD::ADD,             MVT::v8i32, Legal);
01266       setOperationAction(ISD::ADD,             MVT::v16i16, Legal);
01267       setOperationAction(ISD::ADD,             MVT::v32i8, Legal);
01268 
01269       setOperationAction(ISD::SUB,             MVT::v4i64, Legal);
01270       setOperationAction(ISD::SUB,             MVT::v8i32, Legal);
01271       setOperationAction(ISD::SUB,             MVT::v16i16, Legal);
01272       setOperationAction(ISD::SUB,             MVT::v32i8, Legal);
01273 
01274       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01275       setOperationAction(ISD::MUL,             MVT::v8i32, Legal);
01276       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
01277       // Don't lower v32i8 because there is no 128-bit byte mul
01278 
01279       setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
01280       setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
01281       setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
01282       setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
01283 
01284       setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
01285       setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
01286 
01287       // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
01288       // when we have a 256-bit-wide blend with immediate.
01289       setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
01290     } else {
01291       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
01292       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
01293       setOperationAction(ISD::ADD,             MVT::v16i16, Custom);
01294       setOperationAction(ISD::ADD,             MVT::v32i8, Custom);
01295 
01296       setOperationAction(ISD::SUB,             MVT::v4i64, Custom);
01297       setOperationAction(ISD::SUB,             MVT::v8i32, Custom);
01298       setOperationAction(ISD::SUB,             MVT::v16i16, Custom);
01299       setOperationAction(ISD::SUB,             MVT::v32i8, Custom);
01300 
01301       setOperationAction(ISD::MUL,             MVT::v4i64, Custom);
01302       setOperationAction(ISD::MUL,             MVT::v8i32, Custom);
01303       setOperationAction(ISD::MUL,             MVT::v16i16, Custom);
01304       // Don't lower v32i8 because there is no 128-bit byte mul
01305     }
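    // Roughly speaking, in the non-AVX2 case above the Custom 256-bit integer
    // add/sub/mul lowerings split each operand into two 128-bit halves, do the
    // work with SSE instructions, and concatenate the results; e.g. a v8i32
    // add becomes two v4i32 adds.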
01306 
01307     // In the customized shift lowering, the legal cases in AVX2 will be
01308     // recognized.
01309     setOperationAction(ISD::SRL,               MVT::v4i64, Custom);
01310     setOperationAction(ISD::SRL,               MVT::v8i32, Custom);
01311 
01312     setOperationAction(ISD::SHL,               MVT::v4i64, Custom);
01313     setOperationAction(ISD::SHL,               MVT::v8i32, Custom);
01314 
01315     setOperationAction(ISD::SRA,               MVT::v8i32, Custom);
01316 
01317     // Custom lower several nodes for 256-bit types.
01318     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01319              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01320       MVT VT = (MVT::SimpleValueType)i;
01321 
01322       // Extract subvector is special because the value type
01323       // (result) is 128-bit but the source is 256-bit wide.
01324       if (VT.is128BitVector()) {
01325         if (VT.getScalarSizeInBits() >= 32) {
01326           setOperationAction(ISD::MLOAD,  VT, Custom);
01327           setOperationAction(ISD::MSTORE, VT, Custom);
01328         }
01329         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01330       }
01331       // Do not attempt to custom lower other non-256-bit vectors
01332       if (!VT.is256BitVector())
01333         continue;
01334 
01335       if (VT.getScalarSizeInBits() >= 32) {
01336         setOperationAction(ISD::MLOAD,  VT, Legal);
01337         setOperationAction(ISD::MSTORE, VT, Legal);
01338       }
01339       setOperationAction(ISD::BUILD_VECTOR,       VT, Custom);
01340       setOperationAction(ISD::VECTOR_SHUFFLE,     VT, Custom);
01341       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
01342       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
01343       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
01344       setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
01345       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
01346     }
01347 
01348     // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
01349     for (int i = MVT::v32i8; i != MVT::v4i64; ++i) {
01350       MVT VT = (MVT::SimpleValueType)i;
01351 
01352       // Do not attempt to promote non-256-bit vectors
01353       if (!VT.is256BitVector())
01354         continue;
01355 
01356       setOperationAction(ISD::AND,    VT, Promote);
01357       AddPromotedToType (ISD::AND,    VT, MVT::v4i64);
01358       setOperationAction(ISD::OR,     VT, Promote);
01359       AddPromotedToType (ISD::OR,     VT, MVT::v4i64);
01360       setOperationAction(ISD::XOR,    VT, Promote);
01361       AddPromotedToType (ISD::XOR,    VT, MVT::v4i64);
01362       setOperationAction(ISD::LOAD,   VT, Promote);
01363       AddPromotedToType (ISD::LOAD,   VT, MVT::v4i64);
01364       setOperationAction(ISD::SELECT, VT, Promote);
01365       AddPromotedToType (ISD::SELECT, VT, MVT::v4i64);
01366     }
01367   }
01368 
01369   if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) {
01370     addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
01371     addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
01372     addRegisterClass(MVT::v8i64,  &X86::VR512RegClass);
01373     addRegisterClass(MVT::v8f64,  &X86::VR512RegClass);
01374 
01375     addRegisterClass(MVT::i1,     &X86::VK1RegClass);
01376     addRegisterClass(MVT::v8i1,   &X86::VK8RegClass);
01377     addRegisterClass(MVT::v16i1,  &X86::VK16RegClass);
01378 
01379     setOperationAction(ISD::BR_CC,              MVT::i1,    Expand);
01380     setOperationAction(ISD::SETCC,              MVT::i1,    Custom);
01381     setOperationAction(ISD::XOR,                MVT::i1,    Legal);
01382     setOperationAction(ISD::OR,                 MVT::i1,    Legal);
01383     setOperationAction(ISD::AND,                MVT::i1,    Legal);
01384     setLoadExtAction(ISD::EXTLOAD,              MVT::v8f32, Legal);
01385     setOperationAction(ISD::LOAD,               MVT::v16f32, Legal);
01386     setOperationAction(ISD::LOAD,               MVT::v8f64, Legal);
01387     setOperationAction(ISD::LOAD,               MVT::v8i64, Legal);
01388     setOperationAction(ISD::LOAD,               MVT::v16i32, Legal);
01389     setOperationAction(ISD::LOAD,               MVT::v16i1, Legal);
01390 
01391     setOperationAction(ISD::FADD,               MVT::v16f32, Legal);
01392     setOperationAction(ISD::FSUB,               MVT::v16f32, Legal);
01393     setOperationAction(ISD::FMUL,               MVT::v16f32, Legal);
01394     setOperationAction(ISD::FDIV,               MVT::v16f32, Legal);
01395     setOperationAction(ISD::FSQRT,              MVT::v16f32, Legal);
01396     setOperationAction(ISD::FNEG,               MVT::v16f32, Custom);
01397 
01398     setOperationAction(ISD::FADD,               MVT::v8f64, Legal);
01399     setOperationAction(ISD::FSUB,               MVT::v8f64, Legal);
01400     setOperationAction(ISD::FMUL,               MVT::v8f64, Legal);
01401     setOperationAction(ISD::FDIV,               MVT::v8f64, Legal);
01402     setOperationAction(ISD::FSQRT,              MVT::v8f64, Legal);
01403     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
01404     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
01405     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
01406 
01407     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
01408     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
01409     setOperationAction(ISD::SINT_TO_FP,         MVT::i32, Legal);
01410     setOperationAction(ISD::UINT_TO_FP,         MVT::i32, Legal);
01411     if (Subtarget->is64Bit()) {
01412       setOperationAction(ISD::FP_TO_UINT,       MVT::i64, Legal);
01413       setOperationAction(ISD::FP_TO_SINT,       MVT::i64, Legal);
01414       setOperationAction(ISD::SINT_TO_FP,       MVT::i64, Legal);
01415       setOperationAction(ISD::UINT_TO_FP,       MVT::i64, Legal);
01416     }
01417     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
01418     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
01419     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
01420     setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
01421     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
01422     setOperationAction(ISD::SINT_TO_FP,         MVT::v8i1,   Custom);
01423     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i1,  Custom);
01424     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i8,  Promote);
01425     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i16, Promote);
01426     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
01427     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
01428     setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
01429     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
01430     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
01431 
01432     setOperationAction(ISD::TRUNCATE,           MVT::i1, Custom);
01433     setOperationAction(ISD::TRUNCATE,           MVT::v16i8, Custom);
01434     setOperationAction(ISD::TRUNCATE,           MVT::v8i32, Custom);
01435     setOperationAction(ISD::TRUNCATE,           MVT::v8i1, Custom);
01436     setOperationAction(ISD::TRUNCATE,           MVT::v16i1, Custom);
01437     setOperationAction(ISD::TRUNCATE,           MVT::v16i16, Custom);
01438     setOperationAction(ISD::ZERO_EXTEND,        MVT::v16i32, Custom);
01439     setOperationAction(ISD::ZERO_EXTEND,        MVT::v8i64, Custom);
01440     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i32, Custom);
01441     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i64, Custom);
01442     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i8, Custom);
01443     setOperationAction(ISD::SIGN_EXTEND,        MVT::v8i16, Custom);
01444     setOperationAction(ISD::SIGN_EXTEND,        MVT::v16i16, Custom);
01445 
01446     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8f64,  Custom);
01447     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i64,  Custom);
01448     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16f32,  Custom);
01449     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i32,  Custom);
01450     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v8i1,    Custom);
01451     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v16i1, Legal);
01452 
01453     setOperationAction(ISD::SETCC,              MVT::v16i1, Custom);
01454     setOperationAction(ISD::SETCC,              MVT::v8i1, Custom);
01455 
01456     setOperationAction(ISD::MUL,              MVT::v8i64, Custom);
01457 
01458     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
01459     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
01460     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
01461     setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
01462     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
01463     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
01464     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
01465     setOperationAction(ISD::SELECT,             MVT::v8i64, Custom);
01466     setOperationAction(ISD::SELECT,             MVT::v16f32, Custom);
01467 
01468     setOperationAction(ISD::ADD,                MVT::v8i64, Legal);
01469     setOperationAction(ISD::ADD,                MVT::v16i32, Legal);
01470 
01471     setOperationAction(ISD::SUB,                MVT::v8i64, Legal);
01472     setOperationAction(ISD::SUB,                MVT::v16i32, Legal);
01473 
01474     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
01475 
01476     setOperationAction(ISD::SRL,                MVT::v8i64, Custom);
01477     setOperationAction(ISD::SRL,                MVT::v16i32, Custom);
01478 
01479     setOperationAction(ISD::SHL,                MVT::v8i64, Custom);
01480     setOperationAction(ISD::SHL,                MVT::v16i32, Custom);
01481 
01482     setOperationAction(ISD::SRA,                MVT::v8i64, Custom);
01483     setOperationAction(ISD::SRA,                MVT::v16i32, Custom);
01484 
01485     setOperationAction(ISD::AND,                MVT::v8i64, Legal);
01486     setOperationAction(ISD::OR,                 MVT::v8i64, Legal);
01487     setOperationAction(ISD::XOR,                MVT::v8i64, Legal);
01488     setOperationAction(ISD::AND,                MVT::v16i32, Legal);
01489     setOperationAction(ISD::OR,                 MVT::v16i32, Legal);
01490     setOperationAction(ISD::XOR,                MVT::v16i32, Legal);
01491 
01492     if (Subtarget->hasCDI()) {
01493       setOperationAction(ISD::CTLZ,             MVT::v8i64, Legal);
01494       setOperationAction(ISD::CTLZ,             MVT::v16i32, Legal);
01495     }
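    // AVX-512 CD supplies VPLZCNTD/VPLZCNTQ, which is why vector CTLZ can be
    // marked Legal here; e.g. (ctlz v16i32) maps directly onto VPLZCNTD.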
01496 
01497     // Custom lower several nodes.
01498     for (int i = MVT::FIRST_VECTOR_VALUETYPE;
01499              i <= MVT::LAST_VECTOR_VALUETYPE; ++i) {
01500       MVT VT = (MVT::SimpleValueType)i;
01501 
01502       unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01503       // Extract subvector is special because the value type
01504       // (result) is 256/128-bit but the source is 512-bit wide.
01505       if (VT.is128BitVector() || VT.is256BitVector()) {
01506         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
01507         if (EltSize >= 32) {
01508           setOperationAction(ISD::MLOAD,   VT, Legal);
01509           setOperationAction(ISD::MSTORE,  VT, Legal);
01510         }
01511       }
01512       if (VT.getVectorElementType() == MVT::i1)
01513         setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
01514 
01515       // Do not attempt to custom lower other non-512-bit vectors
01516       if (!VT.is512BitVector())
01517         continue;
01518 
01519       if (EltSize >= 32) {
01520         setOperationAction(ISD::VECTOR_SHUFFLE,      VT, Custom);
01521         setOperationAction(ISD::INSERT_VECTOR_ELT,   VT, Custom);
01522         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01523         setOperationAction(ISD::VSELECT,             VT, Legal);
01524         setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
01525         setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
01526         setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
01527         setOperationAction(ISD::MLOAD,               VT, Legal);
01528         setOperationAction(ISD::MSTORE,              VT, Legal);
01529       }
01530     }
01531     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01532       MVT VT = (MVT::SimpleValueType)i;
01533 
01534       // Do not attempt to promote non-512-bit vectors
01535       if (!VT.is512BitVector())
01536         continue;
01537 
01538       setOperationAction(ISD::SELECT, VT, Promote);
01539       AddPromotedToType (ISD::SELECT, VT, MVT::v8i64);
01540     }
01541   } // has AVX-512
01542 
01543   if (!TM.Options.UseSoftFloat && Subtarget->hasBWI()) {
01544     addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
01545     addRegisterClass(MVT::v64i8,  &X86::VR512RegClass);
01546 
01547     addRegisterClass(MVT::v32i1,  &X86::VK32RegClass);
01548     addRegisterClass(MVT::v64i1,  &X86::VK64RegClass);
01549 
01550     setOperationAction(ISD::LOAD,               MVT::v32i16, Legal);
01551     setOperationAction(ISD::LOAD,               MVT::v64i8, Legal);
01552     setOperationAction(ISD::SETCC,              MVT::v32i1, Custom);
01553     setOperationAction(ISD::SETCC,              MVT::v64i1, Custom);
01554 
01555     for (int i = MVT::v32i8; i != MVT::v8i64; ++i) {
01556       const MVT VT = (MVT::SimpleValueType)i;
01557 
01558       const unsigned EltSize = VT.getVectorElementType().getSizeInBits();
01559 
01560       // Do not attempt to set actions for non-512-bit vectors
01561       if (!VT.is512BitVector())
01562         continue;
01563 
01564       if (EltSize < 32) {
01565         setOperationAction(ISD::BUILD_VECTOR,        VT, Custom);
01566         setOperationAction(ISD::VSELECT,             VT, Legal);
01567       }
01568     }
01569   }
01570 
01571   if (!TM.Options.UseSoftFloat && Subtarget->hasVLX()) {
01572     addRegisterClass(MVT::v4i1,   &X86::VK4RegClass);
01573     addRegisterClass(MVT::v2i1,   &X86::VK2RegClass);
01574 
01575     setOperationAction(ISD::SETCC,              MVT::v4i1, Custom);
01576     setOperationAction(ISD::SETCC,              MVT::v2i1, Custom);
01577     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v8i1, Legal);
01578   }
01579 
01580   // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion
01581   // of this type with custom code.
01582   for (int VT = MVT::FIRST_VECTOR_VALUETYPE;
01583            VT != MVT::LAST_VECTOR_VALUETYPE; VT++) {
01584     setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
01585                        Custom);
01586   }
01587 
01588   // We want to custom lower some of our intrinsics.
01589   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
01590   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
01591   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
01592   if (!Subtarget->is64Bit())
01593     setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
01594 
01595   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
01596   // handle type legalization for these operations here.
01597   //
01598   // FIXME: We really should do custom legalization for addition and
01599   // subtraction on x86-32 once PR3203 is fixed.  We really can't do much better
01600   // than generic legalization for 64-bit multiplication-with-overflow, though.
01601   for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) {
01602     // Add/Sub/Mul with overflow operations are custom lowered.
01603     MVT VT = IntVTs[i];
01604     setOperationAction(ISD::SADDO, VT, Custom);
01605     setOperationAction(ISD::UADDO, VT, Custom);
01606     setOperationAction(ISD::SSUBO, VT, Custom);
01607     setOperationAction(ISD::USUBO, VT, Custom);
01608     setOperationAction(ISD::SMULO, VT, Custom);
01609     setOperationAction(ISD::UMULO, VT, Custom);
01610   }
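  // As a rough sketch, a call such as @llvm.sadd.with.overflow.i32 is custom
  // lowered to an X86 add node that also produces EFLAGS, with the overflow
  // flag read back through a SETO-style setcc:
  //   %sum = add %a, %b        ; sets OF
  //   %ov  = seto              ; captures the overflow bit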
01611 
01612 
01613   if (!Subtarget->is64Bit()) {
01614     // These libcalls are not available in 32-bit.
01615     setLibcallName(RTLIB::SHL_I128, nullptr);
01616     setLibcallName(RTLIB::SRL_I128, nullptr);
01617     setLibcallName(RTLIB::SRA_I128, nullptr);
01618   }
01619 
01620   // Combine sin / cos into one node or libcall if possible.
01621   if (Subtarget->hasSinCos()) {
01622     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
01623     setLibcallName(RTLIB::SINCOS_F64, "sincos");
01624     if (Subtarget->isTargetDarwin()) {
01625       // For MacOSX, we don't want the normal expansion of a libcall to
01626       // sincos. We want to issue a libcall to __sincos_stret to avoid memory
01627       // traffic.
01628       setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
01629       setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
01630     }
01631   }
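  // For example, a pair of calls sinf(x) and cosf(x) on the same argument can
  // be combined into a single __sincos_stret call on Darwin, which returns
  // both results at once instead of writing them through output pointers.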
01632 
01633   if (Subtarget->isTargetWin64()) {
01634     setOperationAction(ISD::SDIV, MVT::i128, Custom);
01635     setOperationAction(ISD::UDIV, MVT::i128, Custom);
01636     setOperationAction(ISD::SREM, MVT::i128, Custom);
01637     setOperationAction(ISD::UREM, MVT::i128, Custom);
01638     setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
01639     setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
01640   }
01641 
01642   // We have target-specific dag combine patterns for the following nodes:
01643   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
01644   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
01645   setTargetDAGCombine(ISD::VSELECT);
01646   setTargetDAGCombine(ISD::SELECT);
01647   setTargetDAGCombine(ISD::SHL);
01648   setTargetDAGCombine(ISD::SRA);
01649   setTargetDAGCombine(ISD::SRL);
01650   setTargetDAGCombine(ISD::OR);
01651   setTargetDAGCombine(ISD::AND);
01652   setTargetDAGCombine(ISD::ADD);
01653   setTargetDAGCombine(ISD::FADD);
01654   setTargetDAGCombine(ISD::FSUB);
01655   setTargetDAGCombine(ISD::FMA);
01656   setTargetDAGCombine(ISD::SUB);
01657   setTargetDAGCombine(ISD::LOAD);
01658   setTargetDAGCombine(ISD::STORE);
01659   setTargetDAGCombine(ISD::ZERO_EXTEND);
01660   setTargetDAGCombine(ISD::ANY_EXTEND);
01661   setTargetDAGCombine(ISD::SIGN_EXTEND);
01662   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
01663   setTargetDAGCombine(ISD::TRUNCATE);
01664   setTargetDAGCombine(ISD::SINT_TO_FP);
01665   setTargetDAGCombine(ISD::SETCC);
01666   setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
01667   setTargetDAGCombine(ISD::BUILD_VECTOR);
01668   if (Subtarget->is64Bit())
01669     setTargetDAGCombine(ISD::MUL);
01670   setTargetDAGCombine(ISD::XOR);
01671 
01672   computeRegisterProperties();
01673 
01674   // On Darwin, -Os means optimize for size without hurting performance, so
01675   // do not reduce the limit.
01676   MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
01677   MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
01678   MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
01679   MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01680   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
01681   MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
01682   setPrefLoopAlignment(4); // 2^4 bytes.
01683 
01684   // Predictable cmovs don't hurt on Atom because it's in-order.
01685   PredictableSelectIsExpensive = !Subtarget->isAtom();
01686 
01687   setPrefFunctionAlignment(4); // 2^4 bytes.
01688 
01689   verifyIntrinsicTables();
01690 }
01691 
01692 // This has so far only been implemented for 64-bit MachO.
01693 bool X86TargetLowering::useLoadStackGuardNode() const {
01694   return Subtarget->getTargetTriple().getObjectFormat() == Triple::MachO &&
01695          Subtarget->is64Bit();
01696 }
01697 
01698 TargetLoweringBase::LegalizeTypeAction
01699 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
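  // Under the experimental flag, an illegal type such as v2i32 is widened to
  // v4i32 (padding with undef elements) rather than promoted to v2i64
  // (widening each element), which keeps the original element type visible to
  // later lowering. i1 vectors and single-element vectors keep the default
  // policy.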
01700   if (ExperimentalVectorWideningLegalization &&
01701       VT.getVectorNumElements() != 1 &&
01702       VT.getVectorElementType().getSimpleVT() != MVT::i1)
01703     return TypeWidenVector;
01704 
01705   return TargetLoweringBase::getPreferredVectorAction(VT);
01706 }
01707 
01708 EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
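  // For example, with AVX-512 a compare of two v16f32 values produces a v16i1
  // mask (held in a k-register), whereas with plain SSE/AVX a v4i32 compare
  // produces a v4i32 whose lanes are all-ones or all-zeros.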
01709   if (!VT.isVector())
01710     return Subtarget->hasAVX512() ? MVT::i1: MVT::i8;
01711 
01712   const unsigned NumElts = VT.getVectorNumElements();
01713   const EVT EltVT = VT.getVectorElementType();
01714   if (VT.is512BitVector()) {
01715     if (Subtarget->hasAVX512())
01716       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01717           EltVT == MVT::f32 || EltVT == MVT::f64)
01718         switch(NumElts) {
01719         case  8: return MVT::v8i1;
01720         case 16: return MVT::v16i1;
01721       }
01722     if (Subtarget->hasBWI())
01723       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01724         switch(NumElts) {
01725         case 32: return MVT::v32i1;
01726         case 64: return MVT::v64i1;
01727       }
01728   }
01729 
01730   if (VT.is256BitVector() || VT.is128BitVector()) {
01731     if (Subtarget->hasVLX())
01732       if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
01733           EltVT == MVT::f32 || EltVT == MVT::f64)
01734         switch(NumElts) {
01735         case 2: return MVT::v2i1;
01736         case 4: return MVT::v4i1;
01737         case 8: return MVT::v8i1;
01738       }
01739     if (Subtarget->hasBWI() && Subtarget->hasVLX())
01740       if (EltVT == MVT::i8 || EltVT == MVT::i16)
01741         switch(NumElts) {
01742         case  8: return MVT::v8i1;
01743         case 16: return MVT::v16i1;
01744         case 32: return MVT::v32i1;
01745       }
01746   }
01747 
01748   return VT.changeVectorElementTypeToInteger();
01749 }
01750 
01751 /// getMaxByValAlign - Helper for getByValTypeAlignment to determine
01752 /// the desired ByVal argument alignment.
01753 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
01754   if (MaxAlign == 16)
01755     return;
01756   if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
01757     if (VTy->getBitWidth() == 128)
01758       MaxAlign = 16;
01759   } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
01760     unsigned EltAlign = 0;
01761     getMaxByValAlign(ATy->getElementType(), EltAlign);
01762     if (EltAlign > MaxAlign)
01763       MaxAlign = EltAlign;
01764   } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
01765     for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
01766       unsigned EltAlign = 0;
01767       getMaxByValAlign(STy->getElementType(i), EltAlign);
01768       if (EltAlign > MaxAlign)
01769         MaxAlign = EltAlign;
01770       if (MaxAlign == 16)
01771         break;
01772     }
01773   }
01774 }
01775 
01776 /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
01777 /// function arguments in the caller parameter area. For X86, aggregates
01778 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
01779 /// are at 4-byte boundaries.
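/// For instance, a struct containing a <4 x float> field is placed at a
/// 16-byte boundary when SSE1 is available on 32-bit x86, while a struct of
/// plain ints stays at 4 bytes; on x86-64 the result is always at least 8.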
01780 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
01781   if (Subtarget->is64Bit()) {
01782     // Max of 8 and alignment of type.
01783     unsigned TyAlign = TD->getABITypeAlignment(Ty);
01784     if (TyAlign > 8)
01785       return TyAlign;
01786     return 8;
01787   }
01788 
01789   unsigned Align = 4;
01790   if (Subtarget->hasSSE1())
01791     getMaxByValAlign(Ty, Align);
01792   return Align;
01793 }
01794 
01795 /// getOptimalMemOpType - Returns the target specific optimal type for load
01796 /// and store operations as a result of memset, memcpy, and memmove
01797 /// lowering. If DstAlign is zero, it is safe to assume the destination
01798 /// alignment can satisfy any constraint. Similarly, if SrcAlign is zero,
01799 /// there is no need to check it against an alignment requirement,
01800 /// probably because the source does not need to be loaded. If 'IsMemset' is
01801 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
01802 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
01803 /// source is constant so it does not need to be loaded.
01804 /// It returns EVT::Other if the type should be determined using generic
01805 /// target-independent logic.
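/// As a rough example of the policy below: a memcpy of 32 or more bytes with
/// sufficiently aligned (or fast-unaligned) operands is done with v8i32 stores
/// when AVX2 is available, with v4i32 on SSE2, and otherwise falls back to
/// i64/i32 stores.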
01806 EVT
01807 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
01808                                        unsigned DstAlign, unsigned SrcAlign,
01809                                        bool IsMemset, bool ZeroMemset,
01810                                        bool MemcpyStrSrc,
01811                                        MachineFunction &MF) const {
01812   const Function *F = MF.getFunction();
01813   if ((!IsMemset || ZeroMemset) &&
01814       !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
01815                                        Attribute::NoImplicitFloat)) {
01816     if (Size >= 16 &&
01817         (Subtarget->isUnalignedMemAccessFast() ||
01818          ((DstAlign == 0 || DstAlign >= 16) &&
01819           (SrcAlign == 0 || SrcAlign >= 16)))) {
01820       if (Size >= 32) {
01821         if (Subtarget->hasInt256())
01822           return MVT::v8i32;
01823         if (Subtarget->hasFp256())
01824           return MVT::v8f32;
01825       }
01826       if (Subtarget->hasSSE2())
01827         return MVT::v4i32;
01828       if (Subtarget->hasSSE1())
01829         return MVT::v4f32;
01830     } else if (!MemcpyStrSrc && Size >= 8 &&
01831                !Subtarget->is64Bit() &&
01832                Subtarget->hasSSE2()) {
01833       // Do not use f64 to lower memcpy if source is string constant. It's
01834       // better to use i32 to avoid the loads.
01835       return MVT::f64;
01836     }
01837   }
01838   if (Subtarget->is64Bit() && Size >= 8)
01839     return MVT::i64;
01840   return MVT::i32;
01841 }
01842 
01843 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
01844   if (VT == MVT::f32)
01845     return X86ScalarSSEf32;
01846   else if (VT == MVT::f64)
01847     return X86ScalarSSEf64;
01848   return true;
01849 }
01850 
01851 bool
01852 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
01853                                                   unsigned,
01854                                                   unsigned,
01855                                                   bool *Fast) const {
01856   if (Fast)
01857     *Fast = Subtarget->isUnalignedMemAccessFast();
01858   return true;
01859 }
01860 
01861 /// getJumpTableEncoding - Return the entry encoding for a jump table in the
01862 /// current function.  The returned value is a member of the
01863 /// MachineJumpTableInfo::JTEntryKind enum.
01864 unsigned X86TargetLowering::getJumpTableEncoding() const {
01865   // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
01866   // symbol.
01867   if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
01868       Subtarget->isPICStyleGOT())
01869     return MachineJumpTableInfo::EK_Custom32;
01870 
01871   // Otherwise, use the normal jump table encoding heuristics.
01872   return TargetLowering::getJumpTableEncoding();
01873 }
01874 
01875 const MCExpr *
01876 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
01877                                              const MachineBasicBlock *MBB,
01878                                              unsigned uid,MCContext &Ctx) const{
01879   assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ &&
01880          Subtarget->isPICStyleGOT());
01881   // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
01882   // entries.
01883   return MCSymbolRefExpr::Create(MBB->getSymbol(),
01884                                  MCSymbolRefExpr::VK_GOTOFF, Ctx);
01885 }
01886 
01887 /// getPICJumpTableRelocBase - Returns the relocation base for the given PIC
01888 /// jumptable.
01889 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
01890                                                     SelectionDAG &DAG) const {
01891   if (!Subtarget->is64Bit())
01892     // This doesn't have SDLoc associated with it, but is not really the
01893     // same as a Register.
01894     return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy());
01895   return Table;
01896 }
01897 
01898 /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the
01899 /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an
01900 /// MCExpr.
01901 const MCExpr *X86TargetLowering::
01902 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
01903                              MCContext &Ctx) const {
01904   // X86-64 uses RIP relative addressing based on the jump table label.
01905   if (Subtarget->isPICStyleRIPRel())
01906     return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
01907 
01908   // Otherwise, the reference is relative to the PIC base.
01909   return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx);
01910 }
01911 
01912 // FIXME: Why is this routine here? Move to RegInfo!
01913 std::pair<const TargetRegisterClass*, uint8_t>
01914 X86TargetLowering::findRepresentativeClass(MVT VT) const{
01915   const TargetRegisterClass *RRC = nullptr;
01916   uint8_t Cost = 1;
01917   switch (VT.SimpleTy) {
01918   default:
01919     return TargetLowering::findRepresentativeClass(VT);
01920   case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
01921     RRC = Subtarget->is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
01922     break;
01923   case MVT::x86mmx:
01924     RRC = &X86::VR64RegClass;
01925     break;
01926   case MVT::f32: case MVT::f64:
01927   case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
01928   case MVT::v4f32: case MVT::v2f64:
01929   case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32:
01930   case MVT::v4f64:
01931     RRC = &X86::VR128RegClass;
01932     break;
01933   }
01934   return std::make_pair(RRC, Cost);
01935 }
01936 
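// In other words, on x86-64 Linux the stack protector cookie lives at
// %fs:0x28 (%gs:0x28 under the kernel code model), and on i386 at %gs:0x14;
// address space 256 denotes the GS segment and 257 the FS segment here.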
01937 bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
01938                                                unsigned &Offset) const {
01939   if (!Subtarget->isTargetLinux())
01940     return false;
01941 
01942   if (Subtarget->is64Bit()) {
01943     // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:0x28.
01944     Offset = 0x28;
01945     if (getTargetMachine().getCodeModel() == CodeModel::Kernel)
01946       AddressSpace = 256;
01947     else
01948       AddressSpace = 257;
01949   } else {
01950     // %gs:0x14 on i386
01951     Offset = 0x14;
01952     AddressSpace = 256;
01953   }
01954   return true;
01955 }
01956 
01957 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
01958                                             unsigned DestAS) const {
01959   assert(SrcAS != DestAS && "Expected different address spaces!");
01960 
01961   return SrcAS < 256 && DestAS < 256;
01962 }
01963 
01964 //===----------------------------------------------------------------------===//
01965 //               Return Value Calling Convention Implementation
01966 //===----------------------------------------------------------------------===//
01967 
01968 #include "X86GenCallingConv.inc"
01969 
01970 bool
01971 X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
01972                                   MachineFunction &MF, bool isVarArg,
01973                         const SmallVectorImpl<ISD::OutputArg> &Outs,
01974                         LLVMContext &Context) const {
01975   SmallVector<CCValAssign, 16> RVLocs;
01976   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
01977   return CCInfo.CheckReturn(Outs, RetCC_X86);
01978 }
01979 
01980 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
01981   static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
01982   return ScratchRegs;
01983 }
01984 
01985 SDValue
01986 X86TargetLowering::LowerReturn(SDValue Chain,
01987                                CallingConv::ID CallConv, bool isVarArg,
01988                                const SmallVectorImpl<ISD::OutputArg> &Outs,
01989                                const SmallVectorImpl<SDValue> &OutVals,
01990                                SDLoc dl, SelectionDAG &DAG) const {
01991   MachineFunction &MF = DAG.getMachineFunction();
01992   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
01993 
01994   SmallVector<CCValAssign, 16> RVLocs;
01995   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
01996   CCInfo.AnalyzeReturn(Outs, RetCC_X86);
01997 
01998   SDValue Flag;
01999   SmallVector<SDValue, 6> RetOps;
02000   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
02001   // Operand #1 = Bytes To Pop
02002   RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(),
02003                    MVT::i16));
02004 
02005   // Copy the result values into the output registers.
02006   for (unsigned i = 0; i != RVLocs.size(); ++i) {
02007     CCValAssign &VA = RVLocs[i];
02008     assert(VA.isRegLoc() && "Can only return in registers!");
02009     SDValue ValToCopy = OutVals[i];
02010     EVT ValVT = ValToCopy.getValueType();
02011 
02012     // Promote values to the appropriate types
02013     if (VA.getLocInfo() == CCValAssign::SExt)
02014       ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
02015     else if (VA.getLocInfo() == CCValAssign::ZExt)
02016       ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
02017     else if (VA.getLocInfo() == CCValAssign::AExt)
02018       ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
02019     else if (VA.getLocInfo() == CCValAssign::BCvt)
02020       ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy);
02021 
02022     assert(VA.getLocInfo() != CCValAssign::FPExt &&
02023            "Unexpected FP-extend for return value.");  
02024 
02025     // If this is x86-64, and we disabled SSE, we can't return FP values,
02026     // or SSE or MMX vectors.
02027     if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
02028          VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
02029           (Subtarget->is64Bit() && !Subtarget->hasSSE1())) {
02030       report_fatal_error("SSE register return with SSE disabled");
02031     }
02032     // Likewise we can't return F64 values with SSE1 only.  gcc does so, but
02033     // llvm-gcc has never done it right and no one has noticed, so this
02034     // should be OK for now.
02035     if (ValVT == MVT::f64 &&
02036         (Subtarget->is64Bit() && !Subtarget->hasSSE2()))
02037       report_fatal_error("SSE2 register return with SSE2 disabled");
02038 
02039     // Returns in ST0/ST1 are handled specially: these are pushed as operands to
02040     // the RET instruction and handled by the FP Stackifier.
02041     if (VA.getLocReg() == X86::FP0 ||
02042         VA.getLocReg() == X86::FP1) {
02043       // If this is a copy from an xmm register to ST(0), use an FPExtend to
02044       // change the value to the FP stack register class.
02045       if (isScalarFPTypeInSSEReg(VA.getValVT()))
02046         ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
02047       RetOps.push_back(ValToCopy);
02048       // Don't emit a copytoreg.
02049       continue;
02050     }
02051 
02052     // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
02053     // which is returned in RAX / RDX.
02054     if (Subtarget->is64Bit()) {
02055       if (ValVT == MVT::x86mmx) {
02056         if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
02057           ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy);
02058           ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
02059                                   ValToCopy);
02060           // If we don't have SSE2 available, convert to v4f32 so the generated
02061           // register is legal.
02062           if (!Subtarget->hasSSE2())
02063             ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy);
02064         }
02065       }
02066     }
02067 
02068     Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
02069     Flag = Chain.getValue(1);
02070     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
02071   }
02072 
02073   // The x86-64 ABIs require that for returning structs by value we copy
02074   // the sret argument into %rax/%eax (depending on ABI) for the return.
02075   // Win32 requires us to put the sret argument to %eax as well.
02076   // We saved the argument into a virtual register in the entry block,
02077   // so now we copy the value out and into %rax/%eax.
02078   if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() &&
02079       (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
02080     MachineFunction &MF = DAG.getMachineFunction();
02081     X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02082     unsigned Reg = FuncInfo->getSRetReturnReg();
02083     assert(Reg &&
02084            "SRetReturnReg should have been set in LowerFormalArguments().");
02085     SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
02086 
02087     unsigned RetValReg
02088         = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ?
02089           X86::RAX : X86::EAX;
02090     Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
02091     Flag = Chain.getValue(1);
02092 
02093     // RAX/EAX now acts like a return value.
02094     RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy()));
02095   }
02096 
02097   RetOps[0] = Chain;  // Update chain.
02098 
02099   // Add the flag if we have it.
02100   if (Flag.getNode())
02101     RetOps.push_back(Flag);
02102 
02103   return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
02104 }
02105 
02106 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
02107   if (N->getNumValues() != 1)
02108     return false;
02109   if (!N->hasNUsesOfValue(1, 0))
02110     return false;
02111 
02112   SDValue TCChain = Chain;
02113   SDNode *Copy = *N->use_begin();
02114   if (Copy->getOpcode() == ISD::CopyToReg) {
02115     // If the copy has a glue operand, we conservatively assume it isn't safe to
02116     // perform a tail call.
02117     if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
02118       return false;
02119     TCChain = Copy->getOperand(0);
02120   } else if (Copy->getOpcode() != ISD::FP_EXTEND)
02121     return false;
02122 
02123   bool HasRet = false;
02124   for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
02125        UI != UE; ++UI) {
02126     if (UI->getOpcode() != X86ISD::RET_FLAG)
02127       return false;
02128     // If we are returning more than one value, we can definitely
02129     // not make a tail call; see PR19530.
02130     if (UI->getNumOperands() > 4)
02131       return false;
02132     if (UI->getNumOperands() == 4 &&
02133         UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
02134       return false;
02135     HasRet = true;
02136   }
02137 
02138   if (!HasRet)
02139     return false;
02140 
02141   Chain = TCChain;
02142   return true;
02143 }
02144 
02145 EVT
02146 X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
02147                                             ISD::NodeType ExtendKind) const {
02148   MVT ReturnMVT;
02149   // TODO: Is this also valid on 32-bit?
02150   if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND)
02151     ReturnMVT = MVT::i8;
02152   else
02153     ReturnMVT = MVT::i32;
02154 
02155   EVT MinVT = getRegisterType(Context, ReturnMVT);
02156   return VT.bitsLT(MinVT) ? MinVT : VT;
02157 }
02158 
02159 /// LowerCallResult - Lower the result values of a call into the
02160 /// appropriate copies out of the corresponding physical registers.
02161 ///
02162 SDValue
02163 X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
02164                                    CallingConv::ID CallConv, bool isVarArg,
02165                                    const SmallVectorImpl<ISD::InputArg> &Ins,
02166                                    SDLoc dl, SelectionDAG &DAG,
02167                                    SmallVectorImpl<SDValue> &InVals) const {
02168 
02169   // Assign locations to each value returned by this call.
02170   SmallVector<CCValAssign, 16> RVLocs;
02171   bool Is64Bit = Subtarget->is64Bit();
02172   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
02173                  *DAG.getContext());
02174   CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
02175 
02176   // Copy all of the result registers out of their specified physreg.
02177   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
02178     CCValAssign &VA = RVLocs[i];
02179     EVT CopyVT = VA.getValVT();
02180 
02181     // If this is x86-64, and we disabled SSE, we can't return FP values
02182     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
02183         ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) {
02184       report_fatal_error("SSE register return with SSE disabled");
02185     }
02186 
02187     // If we prefer to use the value in xmm registers, copy it out as f80 and
02188     // use a truncate to move it from fp stack reg to xmm reg.
02189     if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
02190         isScalarFPTypeInSSEReg(VA.getValVT()))
02191       CopyVT = MVT::f80;
02192 
02193     Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(),
02194                                CopyVT, InFlag).getValue(1);
02195     SDValue Val = Chain.getValue(0);
02196 
02197     if (CopyVT != VA.getValVT())
02198       Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
02199                         // This truncation won't change the value.
02200                         DAG.getIntPtrConstant(1));
02201 
02202     InFlag = Chain.getValue(2);
02203     InVals.push_back(Val);
02204   }
02205 
02206   return Chain;
02207 }
02208 
02209 //===----------------------------------------------------------------------===//
02210 //                C & StdCall & Fast Calling Convention implementation
02211 //===----------------------------------------------------------------------===//
02212 //  The StdCall calling convention is standard for many Windows API routines.
02213 //  It differs from the C calling convention only slightly: the callee, not
02214 //  the caller, cleans up the stack, and symbols are decorated (name-mangled)
02215 //  in a particular way. It doesn't support any vector arguments.
02216 //  For info on fast calling convention see Fast Calling Convention (tail call)
02217 //  implementation LowerX86_32FastCCCallTo.
02218 
02219 /// CallIsStructReturn - Determines whether a call uses struct return
02220 /// semantics.
02221 enum StructReturnType {
02222   NotStructReturn,
02223   RegStructReturn,
02224   StackStructReturn
02225 };
02226 static StructReturnType
02227 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs) {
02228   if (Outs.empty())
02229     return NotStructReturn;
02230 
02231   const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
02232   if (!Flags.isSRet())
02233     return NotStructReturn;
02234   if (Flags.isInReg())
02235     return RegStructReturn;
02236   return StackStructReturn;
02237 }
02238 
02239 /// ArgsAreStructReturn - Determines whether a function uses struct
02240 /// return semantics.
02241 static StructReturnType
02242 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins) {
02243   if (Ins.empty())
02244     return NotStructReturn;
02245 
02246   const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
02247   if (!Flags.isSRet())
02248     return NotStructReturn;
02249   if (Flags.isInReg())
02250     return RegStructReturn;
02251   return StackStructReturn;
02252 }
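// For example, a call whose first argument carries the 'sret' flag returns its
// aggregate through a hidden pointer; if that argument is also marked 'inreg',
// the pointer is passed in a register, otherwise it goes on the stack.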
02253 
02254 /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified
02255 /// by "Src" to address "Dst" with size and alignment information specified by
02256 /// the specific parameter attribute. The copy will be passed as a byval
02257 /// function parameter.
02258 static SDValue
02259 CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
02260                           ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
02261                           SDLoc dl) {
02262   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
02263 
02264   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
02265                        /*isVolatile*/false, /*AlwaysInline=*/true,
02266                        MachinePointerInfo(), MachinePointerInfo());
02267 }
02268 
02269 /// IsTailCallConvention - Return true if the calling convention is one that
02270 /// supports tail call optimization.
02271 static bool IsTailCallConvention(CallingConv::ID CC) {
02272   return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
02273           CC == CallingConv::HiPE);
02274 }
02275 
02276 /// \brief Return true if the calling convention is a C calling convention.
02277 static bool IsCCallConvention(CallingConv::ID CC) {
02278   return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 ||
02279           CC == CallingConv::X86_64_SysV);
02280 }
02281 
02282 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
02283   if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls)
02284     return false;
02285 
02286   CallSite CS(CI);
02287   CallingConv::ID CalleeCC = CS.getCallingConv();
02288   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
02289     return false;
02290 
02291   return true;
02292 }
02293 
02294 /// FuncIsMadeTailCallSafe - Return true if the function is being made into
02295 /// a tailcall target by changing its ABI.
02296 static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
02297                                    bool GuaranteedTailCallOpt) {
02298   return GuaranteedTailCallOpt && IsTailCallConvention(CC);
02299 }
02300 
02301 SDValue
02302 X86TargetLowering::LowerMemArgument(SDValue Chain,
02303                                     CallingConv::ID CallConv,
02304                                     const SmallVectorImpl<ISD::InputArg> &Ins,
02305                                     SDLoc dl, SelectionDAG &DAG,
02306                                     const CCValAssign &VA,
02307                                     MachineFrameInfo *MFI,
02308                                     unsigned i) const {
02309   // Create the nodes corresponding to a load from this parameter slot.
02310   ISD::ArgFlagsTy Flags = Ins[i].Flags;
02311   bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
02312       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
02313   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
02314   EVT ValVT;
02315 
02316   // If the value is passed by pointer, the address is passed instead of the
02317   // value itself.
02318   if (VA.getLocInfo() == CCValAssign::Indirect)
02319     ValVT = VA.getLocVT();
02320   else
02321     ValVT = VA.getValVT();
02322 
02323   // FIXME: For now, all byval parameter objects are marked mutable. This can be
02324   // changed with more analysis.
02325   // In the case of tail call optimization, mark all arguments mutable, since
02326   // they could be overwritten when the arguments are lowered for a tail call.
02327   if (Flags.isByVal()) {
02328     unsigned Bytes = Flags.getByValSize();
02329     if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
02330     int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
02331     return DAG.getFrameIndex(FI, getPointerTy());
02332   } else {
02333     int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
02334                                     VA.getLocMemOffset(), isImmutable);
02335     SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
02336     return DAG.getLoad(ValVT, dl, Chain, FIN,
02337                        MachinePointerInfo::getFixedStack(FI),
02338                        false, false, false, 0);
02339   }
02340 }
02341 
02342 // FIXME: Get this from tablegen.
02343 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
02344                                                 const X86Subtarget *Subtarget) {
02345   assert(Subtarget->is64Bit());
02346 
02347   if (Subtarget->isCallingConvWin64(CallConv)) {
02348     static const MCPhysReg GPR64ArgRegsWin64[] = {
02349       X86::RCX, X86::RDX, X86::R8,  X86::R9
02350     };
02351     return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
02352   }
02353 
02354   static const MCPhysReg GPR64ArgRegs64Bit[] = {
02355     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
02356   };
02357   return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
02358 }
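
// Illustration (hedged): for a 64-bit call such as f(1, 2, 3.0), the SysV
// convention places the integers in RDI and RSI and the double in XMM0,
// whereas Win64 uses positional slots, placing them in RCX, RDX and XMM2.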
02359 
02360 // FIXME: Get this from tablegen.
02361 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
02362                                                 CallingConv::ID CallConv,
02363                                                 const X86Subtarget *Subtarget) {
02364   assert(Subtarget->is64Bit());
02365   if (Subtarget->isCallingConvWin64(CallConv)) {
02366     // The XMM registers which might contain vararg parameters are shadowed
02367     // by their paired GPRs, so we only need to save the GPRs to their home
02368     // slots.
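    // Illustration (hedged): a double passed as the second variadic argument
    // occupies XMM1 but, under the Win64 varargs rules, is duplicated in its
    // shadow GPR RDX, so spilling RDX to its home slot is enough for va_arg
    // to find the value.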
02369     // TODO: __vectorcall will change this.
02370     return None;
02371   }
02372 
02373   const Function *Fn = MF.getFunction();
02374   bool NoImplicitFloatOps = Fn->getAttributes().
02375       hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
02376   assert(!(MF.getTarget().Options.UseSoftFloat && NoImplicitFloatOps) &&
02377          "SSE register cannot be used when SSE is disabled!");
02378   if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
02379       !Subtarget->hasSSE1())
02380     // Kernel mode asks for SSE to be disabled, so there are no XMM argument
02381     // registers.
02382     return None;
02383 
02384   static const MCPhysReg XMMArgRegs64Bit[] = {
02385     X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02386     X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02387   };
02388   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
02389 }
02390 
02391 SDValue
02392 X86TargetLowering::LowerFormalArguments(SDValue Chain,
02393                                         CallingConv::ID CallConv,
02394                                         bool isVarArg,
02395                                       const SmallVectorImpl<ISD::InputArg> &Ins,
02396                                         SDLoc dl,
02397                                         SelectionDAG &DAG,
02398                                         SmallVectorImpl<SDValue> &InVals)
02399                                           const {
02400   MachineFunction &MF = DAG.getMachineFunction();
02401   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
02402 
02403   const Function* Fn = MF.getFunction();
02404   if (Fn->hasExternalLinkage() &&
02405       Subtarget->isTargetCygMing() &&
02406       Fn->getName() == "main")
02407     FuncInfo->setForceFramePointer(true);
02408 
02409   MachineFrameInfo *MFI = MF.getFrameInfo();
02410   bool Is64Bit = Subtarget->is64Bit();
02411   bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);
02412 
02413   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02414          "Var args not supported with calling convention fastcc, ghc or hipe");
02415 
02416   // Assign locations to all of the incoming arguments.
02417   SmallVector<CCValAssign, 16> ArgLocs;
02418   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02419 
02420   // Allocate shadow area for Win64
02421   if (IsWin64)
02422     CCInfo.AllocateStack(32, 8);
02423 
02424   CCInfo.AnalyzeFormalArguments(Ins, CC_X86);
02425 
02426   unsigned LastVal = ~0U;
02427   SDValue ArgValue;
02428   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02429     CCValAssign &VA = ArgLocs[i];
02430     // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
02431     // places.
02432     assert(VA.getValNo() != LastVal &&
02433            "Don't support value assigned to multiple locs yet");
02434     (void)LastVal;
02435     LastVal = VA.getValNo();
02436 
02437     if (VA.isRegLoc()) {
02438       EVT RegVT = VA.getLocVT();
02439       const TargetRegisterClass *RC;
02440       if (RegVT == MVT::i32)
02441         RC = &X86::GR32RegClass;
02442       else if (Is64Bit && RegVT == MVT::i64)
02443         RC = &X86::GR64RegClass;
02444       else if (RegVT == MVT::f32)
02445         RC = &X86::FR32RegClass;
02446       else if (RegVT == MVT::f64)
02447         RC = &X86::FR64RegClass;
02448       else if (RegVT.is512BitVector())
02449         RC = &X86::VR512RegClass;
02450       else if (RegVT.is256BitVector())
02451         RC = &X86::VR256RegClass;
02452       else if (RegVT.is128BitVector())
02453         RC = &X86::VR128RegClass;
02454       else if (RegVT == MVT::x86mmx)
02455         RC = &X86::VR64RegClass;
02456       else if (RegVT == MVT::i1)
02457         RC = &X86::VK1RegClass;
02458       else if (RegVT == MVT::v8i1)
02459         RC = &X86::VK8RegClass;
02460       else if (RegVT == MVT::v16i1)
02461         RC = &X86::VK16RegClass;
02462       else if (RegVT == MVT::v32i1)
02463         RC = &X86::VK32RegClass;
02464       else if (RegVT == MVT::v64i1)
02465         RC = &X86::VK64RegClass;
02466       else
02467         llvm_unreachable("Unknown argument type!");
02468 
02469       unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
02470       ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
02471 
02472       // If this is an 8 or 16-bit value, it is really passed promoted to 32
02473       // bits.  Insert an assert[sz]ext to capture this, then truncate to the
02474       // right size.
02475       if (VA.getLocInfo() == CCValAssign::SExt)
02476         ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
02477                                DAG.getValueType(VA.getValVT()));
02478       else if (VA.getLocInfo() == CCValAssign::ZExt)
02479         ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
02480                                DAG.getValueType(VA.getValVT()));
02481       else if (VA.getLocInfo() == CCValAssign::BCvt)
02482         ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
02483 
02484       if (VA.isExtInLoc()) {
02485         // Handle MMX values passed in XMM regs.
02486         if (RegVT.isVector())
02487           ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
02488         else
02489           ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
02490       }
02491     } else {
02492       assert(VA.isMemLoc());
02493       ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
02494     }
02495 
02496     // If the value is passed via a pointer, do a load.
02497     if (VA.getLocInfo() == CCValAssign::Indirect)
02498       ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
02499                              MachinePointerInfo(), false, false, false, 0);
02500 
02501     InVals.push_back(ArgValue);
02502   }
02503 
02504   if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
02505     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02506       // The x86-64 ABIs require that for returning structs by value we copy
02507       // the sret argument into %rax/%eax (depending on ABI) for the return.
02508       // Win32 requires us to put the sret argument in %eax as well.
02509       // Save the argument into a virtual register so that we can access it
02510       // from the return points.
02511       if (Ins[i].Flags.isSRet()) {
02512         unsigned Reg = FuncInfo->getSRetReturnReg();
02513         if (!Reg) {
02514           MVT PtrTy = getPointerTy();
02515           Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
02516           FuncInfo->setSRetReturnReg(Reg);
02517         }
02518         SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
02519         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
02520         break;
02521       }
02522     }
02523   }
02524 
02525   unsigned StackSize = CCInfo.getNextStackOffset();
02526   // Align stack specially for tail calls.
02527   if (FuncIsMadeTailCallSafe(CallConv,
02528                              MF.getTarget().Options.GuaranteedTailCallOpt))
02529     StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
02530 
02531   // If the function takes a variable number of arguments, make a frame index for
02532   // the start of the first vararg value... for expansion of llvm.va_start. We
02533   // can skip this if there are no va_start calls.
02534   if (MFI->hasVAStart() &&
02535       (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
02536                    CallConv != CallingConv::X86_ThisCall))) {
02537     FuncInfo->setVarArgsFrameIndex(
02538         MFI->CreateFixedObject(1, StackSize, true));
02539   }
02540 
02541   // 64-bit calling conventions support varargs and register parameters, so we
02542   // have to do extra work to spill them in the prologue or forward them to
02543   // musttail calls.
02544   if (Is64Bit && isVarArg &&
02545       (MFI->hasVAStart() || MFI->hasMustTailInVarArgFunc())) {
02546     // Find the index of the first unallocated GPR and XMM argument register.
02547     ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
02548     ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
02549     unsigned NumIntRegs =
02550         CCInfo.getFirstUnallocated(ArgGPRs.data(), ArgGPRs.size());
02551     unsigned NumXMMRegs =
02552         CCInfo.getFirstUnallocated(ArgXMMs.data(), ArgXMMs.size());
02553     assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
02554            "SSE register cannot be used when SSE is disabled!");
02555 
02556     // Gather all the live in physical registers.
02557     SmallVector<SDValue, 6> LiveGPRs;
02558     SmallVector<SDValue, 8> LiveXMMRegs;
02559     SDValue ALVal;
02560     for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
02561       unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
02562       LiveGPRs.push_back(
02563           DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
02564     }
02565     if (!ArgXMMs.empty()) {
02566       unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
02567       ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
02568       for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
02569         unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
02570         LiveXMMRegs.push_back(
02571             DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
02572       }
02573     }
02574 
02575     // Store them to the va_list returned by va_start.
02576     if (MFI->hasVAStart()) {
02577       if (IsWin64) {
02578         const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
02579         // Get to the caller-allocated home save location.  Add 8 to account
02580         // for the return address.
02581         int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
02582         FuncInfo->setRegSaveFrameIndex(
02583           MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
02584         // Fixup to set vararg frame on shadow area (4 x i64).
02585         if (NumIntRegs < 4)
02586           FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
02587       } else {
02588         // For X86-64, if there are vararg parameters that are passed via
02589         // registers, then we must store them to their spots on the stack so
02590         // they may be loaded by dereferencing the result of va_next.
02591         FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
02592         FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
02593         FuncInfo->setRegSaveFrameIndex(MFI->CreateStackObject(
02594             ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
02595       }
02596 
02597       // Store the integer parameter registers.
02598       SmallVector<SDValue, 8> MemOps;
02599       SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
02600                                         getPointerTy());
02601       unsigned Offset = FuncInfo->getVarArgsGPOffset();
02602       for (SDValue Val : LiveGPRs) {
02603         SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
02604                                   DAG.getIntPtrConstant(Offset));
02605         SDValue Store =
02606           DAG.getStore(Val.getValue(1), dl, Val, FIN,
02607                        MachinePointerInfo::getFixedStack(
02608                          FuncInfo->getRegSaveFrameIndex(), Offset),
02609                        false, false, 0);
02610         MemOps.push_back(Store);
02611         Offset += 8;
02612       }
02613 
02614       if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
02615         // Now store the XMM (fp + vector) parameter registers.
02616         SmallVector<SDValue, 12> SaveXMMOps;
02617         SaveXMMOps.push_back(Chain);
02618         SaveXMMOps.push_back(ALVal);
02619         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02620                                FuncInfo->getRegSaveFrameIndex()));
02621         SaveXMMOps.push_back(DAG.getIntPtrConstant(
02622                                FuncInfo->getVarArgsFPOffset()));
02623         SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
02624                           LiveXMMRegs.end());
02625         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
02626                                      MVT::Other, SaveXMMOps));
02627       }
02628 
02629       if (!MemOps.empty())
02630         Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
02631     } else {
02632       // Add all GPRs, AL, and XMMs to the list of forwards.  We will add them
02633       // to the liveout set on a musttail call.
02634       assert(MFI->hasMustTailInVarArgFunc());
02635       auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
02636       typedef X86MachineFunctionInfo::Forward Forward;
02637 
02638       for (unsigned I = 0, E = LiveGPRs.size(); I != E; ++I) {
02639         unsigned VReg =
02640             MF.getRegInfo().createVirtualRegister(&X86::GR64RegClass);
02641         Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveGPRs[I]);
02642         Forwards.push_back(Forward(VReg, ArgGPRs[NumIntRegs + I], MVT::i64));
02643       }
02644 
02645       if (!ArgXMMs.empty()) {
02646         unsigned ALVReg =
02647             MF.getRegInfo().createVirtualRegister(&X86::GR8RegClass);
02648         Chain = DAG.getCopyToReg(Chain, dl, ALVReg, ALVal);
02649         Forwards.push_back(Forward(ALVReg, X86::AL, MVT::i8));
02650 
02651         for (unsigned I = 0, E = LiveXMMRegs.size(); I != E; ++I) {
02652           unsigned VReg =
02653               MF.getRegInfo().createVirtualRegister(&X86::VR128RegClass);
02654           Chain = DAG.getCopyToReg(Chain, dl, VReg, LiveXMMRegs[I]);
02655           Forwards.push_back(
02656               Forward(VReg, ArgXMMs[NumXMMRegs + I], MVT::v4f32));
02657         }
02658       }
02659     }
02660   }
02661 
02662   // Some CCs need callee pop.
02663   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
02664                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
02665     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
02666   } else {
02667     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
02668     // If this is an sret function, the return should pop the hidden pointer.
02669     if (!Is64Bit && !IsTailCallConvention(CallConv) &&
02670         !Subtarget->getTargetTriple().isOSMSVCRT() &&
02671         argsAreStructReturn(Ins) == StackStructReturn)
02672       FuncInfo->setBytesToPopOnReturn(4);
02673   }
02674 
02675   if (!Is64Bit) {
02676     // RegSaveFrameIndex is X86-64 only.
02677     FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
02678     if (CallConv == CallingConv::X86_FastCall ||
02679         CallConv == CallingConv::X86_ThisCall)
02680       // fastcall and thiscall functions can't have varargs.
02681       FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
02682   }
02683 
02684   FuncInfo->setArgumentStackSize(StackSize);
02685 
02686   return Chain;
02687 }
02688 
02689 SDValue
02690 X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
02691                                     SDValue StackPtr, SDValue Arg,
02692                                     SDLoc dl, SelectionDAG &DAG,
02693                                     const CCValAssign &VA,
02694                                     ISD::ArgFlagsTy Flags) const {
02695   unsigned LocMemOffset = VA.getLocMemOffset();
02696   SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
02697   PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
02698   if (Flags.isByVal())
02699     return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
02700 
02701   return DAG.getStore(Chain, dl, Arg, PtrOff,
02702                       MachinePointerInfo::getStack(LocMemOffset),
02703                       false, false, 0);
02704 }
02705 
02706 /// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
02707 /// optimization is performed and it is required.
02708 SDValue
02709 X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
02710                                            SDValue &OutRetAddr, SDValue Chain,
02711                                            bool IsTailCall, bool Is64Bit,
02712                                            int FPDiff, SDLoc dl) const {
02713   // Adjust the Return address stack slot.
02714   EVT VT = getPointerTy();
02715   OutRetAddr = getReturnAddressFrameIndex(DAG);
02716 
02717   // Load the "old" Return address.
02718   OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
02719                            false, false, false, 0);
02720   return SDValue(OutRetAddr.getNode(), 1);
02721 }
02722 
02723 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
02724 /// optimization is performed and it is required (FPDiff!=0).
02725 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
02726                                         SDValue Chain, SDValue RetAddrFrIdx,
02727                                         EVT PtrVT, unsigned SlotSize,
02728                                         int FPDiff, SDLoc dl) {
02729   // Store the return address to the appropriate stack slot.
02730   if (!FPDiff) return Chain;
02731   // Calculate the new stack slot for the return address.
02732   int NewReturnAddrFI =
02733     MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
02734                                          false);
02735   SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
02736   Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
02737                        MachinePointerInfo::getFixedStack(NewReturnAddrFI),
02738                        false, false, 0);
02739   return Chain;
02740 }
02741 
02742 SDValue
02743 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
02744                              SmallVectorImpl<SDValue> &InVals) const {
02745   SelectionDAG &DAG                     = CLI.DAG;
02746   SDLoc &dl                             = CLI.DL;
02747   SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
02748   SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
02749   SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
02750   SDValue Chain                         = CLI.Chain;
02751   SDValue Callee                        = CLI.Callee;
02752   CallingConv::ID CallConv              = CLI.CallConv;
02753   bool &isTailCall                      = CLI.IsTailCall;
02754   bool isVarArg                         = CLI.IsVarArg;
02755 
02756   MachineFunction &MF = DAG.getMachineFunction();
02757   bool Is64Bit        = Subtarget->is64Bit();
02758   bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
02759   StructReturnType SR = callIsStructReturn(Outs);
02760   bool IsSibcall      = false;
02761   X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
02762 
02763   if (MF.getTarget().Options.DisableTailCalls)
02764     isTailCall = false;
02765 
02766   bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
02767   if (IsMustTail) {
02768     // Force this to be a tail call.  The verifier rules are enough to ensure
02769     // that we can lower this successfully without moving the return address
02770     // around.
02771     isTailCall = true;
02772   } else if (isTailCall) {
02773     // Check if it's really possible to do a tail call.
02774     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
02775                     isVarArg, SR != NotStructReturn,
02776                     MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
02777                     Outs, OutVals, Ins, DAG);
02778 
02779     // Sibcalls are automatically detected tailcalls which do not require
02780     // ABI changes.
02781     if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
02782       IsSibcall = true;
02783 
02784     if (isTailCall)
02785       ++NumTailCalls;
02786   }
02787 
02788   assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
02789          "Var args not supported with calling convention fastcc, ghc or hipe");
02790 
02791   // Analyze operands of the call, assigning locations to each operand.
02792   SmallVector<CCValAssign, 16> ArgLocs;
02793   CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
02794 
02795   // Allocate shadow area for Win64
02796   if (IsWin64)
02797     CCInfo.AllocateStack(32, 8);
02798 
02799   CCInfo.AnalyzeCallOperands(Outs, CC_X86);
02800 
02801   // Get a count of how many bytes are to be pushed on the stack.
02802   unsigned NumBytes = CCInfo.getNextStackOffset();
02803   if (IsSibcall)
02804     // This is a sibcall. The memory operands are already available in the
02805     // caller's incoming argument area, which was set up by its own caller.
02806     NumBytes = 0;
02807   else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
02808            IsTailCallConvention(CallConv))
02809     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
02810 
02811   int FPDiff = 0;
02812   if (isTailCall && !IsSibcall && !IsMustTail) {
02813     // Lower arguments at fp - stackoffset + fpdiff.
02814     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
02815 
02816     FPDiff = NumBytesCallerPushed - NumBytes;
02817 
02818     // Set the delta of movement of the returnaddr stackslot, but only if the
02819     // new delta requires moving the return address further than before.
02820     if (FPDiff < X86Info->getTCReturnAddrDelta())
02821       X86Info->setTCReturnAddrDelta(FPDiff);
02822   }
02823 
02824   unsigned NumBytesToPush = NumBytes;
02825   unsigned NumBytesToPop = NumBytes;
02826 
02827   // If we have an inalloca argument, all stack space has already been allocated
02828   // for us and is right at the top of the stack.  We don't support multiple
02829   // arguments passed in memory when using inalloca.
02830   if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
02831     NumBytesToPush = 0;
02832     if (!ArgLocs.back().isMemLoc())
02833       report_fatal_error("cannot use inalloca attribute on a register "
02834                          "parameter");
02835     if (ArgLocs.back().getLocMemOffset() != 0)
02836       report_fatal_error("any parameter with the inalloca attribute must be "
02837                          "the only memory argument");
02838   }
02839 
02840   if (!IsSibcall)
02841     Chain = DAG.getCALLSEQ_START(
02842         Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);
02843 
02844   SDValue RetAddrFrIdx;
02845   // Load return address for tail calls.
02846   if (isTailCall && FPDiff)
02847     Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
02848                                     Is64Bit, FPDiff, dl);
02849 
02850   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
02851   SmallVector<SDValue, 8> MemOpChains;
02852   SDValue StackPtr;
02853 
02854   // Walk the register/memloc assignments, inserting copies/loads.  In the case
02855   // of tail call optimization, arguments are handled later.
02856   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
02857       DAG.getSubtarget().getRegisterInfo());
02858   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
02859     // Skip inalloca arguments, they have already been written.
02860     ISD::ArgFlagsTy Flags = Outs[i].Flags;
02861     if (Flags.isInAlloca())
02862       continue;
02863 
02864     CCValAssign &VA = ArgLocs[i];
02865     EVT RegVT = VA.getLocVT();
02866     SDValue Arg = OutVals[i];
02867     bool isByVal = Flags.isByVal();
02868 
02869     // Promote the value if needed.
02870     switch (VA.getLocInfo()) {
02871     default: llvm_unreachable("Unknown loc info!");
02872     case CCValAssign::Full: break;
02873     case CCValAssign::SExt:
02874       Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
02875       break;
02876     case CCValAssign::ZExt:
02877       Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
02878       break;
02879     case CCValAssign::AExt:
02880       if (RegVT.is128BitVector()) {
02881         // Special case: passing MMX values in XMM registers.
02882         Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
02883         Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
02884         Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
02885       } else
02886         Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
02887       break;
02888     case CCValAssign::BCvt:
02889       Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
02890       break;
02891     case CCValAssign::Indirect: {
02892       // Store the argument.
02893       SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
02894       int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
02895       Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
02896                            MachinePointerInfo::getFixedStack(FI),
02897                            false, false, 0);
02898       Arg = SpillSlot;
02899       break;
02900     }
02901     }
02902 
02903     if (VA.isRegLoc()) {
02904       RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
02905       if (isVarArg && IsWin64) {
02906         // Win64 ABI requires argument XMM reg to be copied to the corresponding
02907         // shadow reg if callee is a varargs function.
02908         unsigned ShadowReg = 0;
02909         switch (VA.getLocReg()) {
02910         case X86::XMM0: ShadowReg = X86::RCX; break;
02911         case X86::XMM1: ShadowReg = X86::RDX; break;
02912         case X86::XMM2: ShadowReg = X86::R8; break;
02913         case X86::XMM3: ShadowReg = X86::R9; break;
02914         }
02915         if (ShadowReg)
02916           RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
02917       }
02918     } else if (!IsSibcall && (!isTailCall || isByVal)) {
02919       assert(VA.isMemLoc());
02920       if (!StackPtr.getNode())
02921         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
02922                                       getPointerTy());
02923       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
02924                                              dl, DAG, VA, Flags));
02925     }
02926   }
02927 
02928   if (!MemOpChains.empty())
02929     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
02930 
02931   if (Subtarget->isPICStyleGOT()) {
02932     // ELF / PIC requires the GOT pointer to be in the EBX register before
02933     // making function calls via the PLT.
02934     if (!isTailCall) {
02935       RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
02936                DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
02937     } else {
02938       // If we are tail calling and generating PIC/GOT style code, load the
02939       // address of the callee into ECX. The value in ECX is used as target of
02940       // the tail jump. This is done to circumvent the ebx/callee-saved problem
02941       // for tail calls on PIC/GOT architectures. Normally we would just put the
02942       // address of GOT into ebx and then call target@PLT. But for tail calls
02943       // ebx would be restored (since ebx is callee saved) before jumping to the
02944       // target@PLT.
02945 
02946       // Note: The actual moving to ECX is done further down.
02947       GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
02948       if (G && !G->getGlobal()->hasHiddenVisibility() &&
02949           !G->getGlobal()->hasProtectedVisibility())
02950         Callee = LowerGlobalAddress(Callee, DAG);
02951       else if (isa<ExternalSymbolSDNode>(Callee))
02952         Callee = LowerExternalSymbol(Callee, DAG);
02953     }
02954   }
02955 
02956   if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
02957     // From AMD64 ABI document:
02958     // For calls that may call functions that use varargs or stdargs
02959     // (prototype-less calls or calls to functions containing ellipsis (...) in
02960     // the declaration) %al is used as a hidden argument to specify the number
02961     // of SSE registers used. The contents of %al do not need to match exactly
02962     // the number of registers, but must be an upper bound on the number of SSE
02963     // registers used and is in the range 0 - 8 inclusive.
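    // Illustration (hedged): for a call such as printf("%f\n", 3.14), one XMM
    // register carries the double, so %al must be at least 1; passing the
    // exact count (as done below) or any upper bound up to 8 satisfies the ABI.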
02964 
02965     // Count the number of XMM registers allocated.
02966     static const MCPhysReg XMMArgRegs[] = {
02967       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
02968       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
02969     };
02970     unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
02971     assert((Subtarget->hasSSE1() || !NumXMMRegs)
02972            && "SSE registers cannot be used when SSE is disabled");
02973 
02974     RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
02975                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
02976   }
02977 
02978   if (Is64Bit && isVarArg && IsMustTail) {
02979     const auto &Forwards = X86Info->getForwardedMustTailRegParms();
02980     for (const auto &F : Forwards) {
02981       SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
02982       RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
02983     }
02984   }
02985 
02986   // For tail calls, lower the arguments to the 'real' stack slots.  Sibcalls
02987   // don't need this because the eligibility check rejects calls that require
02988   // shuffling arguments passed in memory.
02989   if (!IsSibcall && isTailCall) {
02990     // Force all the incoming stack arguments to be loaded from the stack
02991     // before any new outgoing arguments are stored to the stack, because the
02992     // outgoing stack slots may alias the incoming argument stack slots, and
02993     // the alias isn't otherwise explicit. This is slightly more conservative
02994     // than necessary, because it means that each store effectively depends
02995     // on every argument instead of just those arguments it would clobber.
02996     SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
02997 
02998     SmallVector<SDValue, 8> MemOpChains2;
02999     SDValue FIN;
03000     int FI = 0;
03001     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03002       CCValAssign &VA = ArgLocs[i];
03003       if (VA.isRegLoc())
03004         continue;
03005       assert(VA.isMemLoc());
03006       SDValue Arg = OutVals[i];
03007       ISD::ArgFlagsTy Flags = Outs[i].Flags;
03008       // Skip inalloca arguments.  They don't require any work.
03009       if (Flags.isInAlloca())
03010         continue;
03011       // Create frame index.
03012       int32_t Offset = VA.getLocMemOffset()+FPDiff;
03013       uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
03014       FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
03015       FIN = DAG.getFrameIndex(FI, getPointerTy());
03016 
03017       if (Flags.isByVal()) {
03018         // Copy relative to framepointer.
03019         SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
03020         if (!StackPtr.getNode())
03021           StackPtr = DAG.getCopyFromReg(Chain, dl,
03022                                         RegInfo->getStackRegister(),
03023                                         getPointerTy());
03024         Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
03025 
03026         MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
03027                                                          ArgChain,
03028                                                          Flags, DAG, dl));
03029       } else {
03030         // Store relative to framepointer.
03031         MemOpChains2.push_back(
03032           DAG.getStore(ArgChain, dl, Arg, FIN,
03033                        MachinePointerInfo::getFixedStack(FI),
03034                        false, false, 0));
03035       }
03036     }
03037 
03038     if (!MemOpChains2.empty())
03039       Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
03040 
03041     // Store the return address to the appropriate stack slot.
03042     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
03043                                      getPointerTy(), RegInfo->getSlotSize(),
03044                                      FPDiff, dl);
03045   }
03046 
03047   // Build a sequence of copy-to-reg nodes chained together with token chain
03048   // and flag operands which copy the outgoing args into registers.
03049   SDValue InFlag;
03050   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
03051     Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
03052                              RegsToPass[i].second, InFlag);
03053     InFlag = Chain.getValue(1);
03054   }
03055 
03056   if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
03057     assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
03058     // In the 64-bit large code model, we have to make all calls
03059     // through a register, since the call instruction's 32-bit
03060     // pc-relative offset may not be large enough to hold the whole
03061     // address.
03062   } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
03063     // If the callee is a GlobalAddress node (quite common, every direct call
03064     // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
03065     // it.
03066 
03067     // We should use extra load for direct calls to dllimported functions in
03068     // non-JIT mode.
03069     const GlobalValue *GV = G->getGlobal();
03070     if (!GV->hasDLLImportStorageClass()) {
03071       unsigned char OpFlags = 0;
03072       bool ExtraLoad = false;
03073       unsigned WrapperKind = ISD::DELETED_NODE;
03074 
03075       // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
03076       // external symbols must go through the PLT in PIC mode.  If the symbol
03077       // has hidden or protected visibility, or if it is static or local, then
03078       // we don't need to use the PLT - we can directly call it.
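      // Illustration (hedged): with PIC on ELF, a call to an external function
      // foo is emitted as "call foo@PLT" rather than a direct "call foo", so
      // the dynamic linker can lazily bind or interpose it.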
03079       if (Subtarget->isTargetELF() &&
03080           DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
03081           GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
03082         OpFlags = X86II::MO_PLT;
03083       } else if (Subtarget->isPICStyleStubAny() &&
03084                  (GV->isDeclaration() || GV->isWeakForLinker()) &&
03085                  (!Subtarget->getTargetTriple().isMacOSX() ||
03086                   Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03087         // PC-relative references to external symbols should go through $stub,
03088         // unless we're building with the leopard linker or later, which
03089         // automatically synthesizes these stubs.
03090         OpFlags = X86II::MO_DARWIN_STUB;
03091       } else if (Subtarget->isPICStyleRIPRel() &&
03092                  isa<Function>(GV) &&
03093                  cast<Function>(GV)->getAttributes().
03094                    hasAttribute(AttributeSet::FunctionIndex,
03095                                 Attribute::NonLazyBind)) {
03096         // If the function is marked as non-lazy, generate an indirect call
03097         // which loads from the GOT directly. This avoids runtime overhead
03098         // at the cost of eager binding (and one extra byte of encoding).
03099         OpFlags = X86II::MO_GOTPCREL;
03100         WrapperKind = X86ISD::WrapperRIP;
03101         ExtraLoad = true;
03102       }
03103 
03104       Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
03105                                           G->getOffset(), OpFlags);
03106 
03107       // Add a wrapper if needed.
03108       if (WrapperKind != ISD::DELETED_NODE)
03109         Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
03110       // Add extra indirection if needed.
03111       if (ExtraLoad)
03112         Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
03113                              MachinePointerInfo::getGOT(),
03114                              false, false, false, 0);
03115     }
03116   } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
03117     unsigned char OpFlags = 0;
03118 
03119     // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
03120     // external symbols should go through the PLT.
03121     if (Subtarget->isTargetELF() &&
03122         DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
03123       OpFlags = X86II::MO_PLT;
03124     } else if (Subtarget->isPICStyleStubAny() &&
03125                (!Subtarget->getTargetTriple().isMacOSX() ||
03126                 Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
03127       // PC-relative references to external symbols should go through $stub,
03128       // unless we're building with the leopard linker or later, which
03129       // automatically synthesizes these stubs.
03130       OpFlags = X86II::MO_DARWIN_STUB;
03131     }
03132 
03133     Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
03134                                          OpFlags);
03135   } else if (Subtarget->isTarget64BitILP32() && Callee->getValueType(0) == MVT::i32) {
03136     // Zero-extend the 32-bit Callee address to 64 bits, as required by the x32 ABI.
03137     Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
03138   }
03139 
03140   // Returns a chain & a flag for retval copy to use.
03141   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
03142   SmallVector<SDValue, 8> Ops;
03143 
03144   if (!IsSibcall && isTailCall) {
03145     Chain = DAG.getCALLSEQ_END(Chain,
03146                                DAG.getIntPtrConstant(NumBytesToPop, true),
03147                                DAG.getIntPtrConstant(0, true), InFlag, dl);
03148     InFlag = Chain.getValue(1);
03149   }
03150 
03151   Ops.push_back(Chain);
03152   Ops.push_back(Callee);
03153 
03154   if (isTailCall)
03155     Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
03156 
03157   // Add argument registers to the end of the list so that they are known live
03158   // into the call.
03159   for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
03160     Ops.push_back(DAG.getRegister(RegsToPass[i].first,
03161                                   RegsToPass[i].second.getValueType()));
03162 
03163   // Add a register mask operand representing the call-preserved registers.
03164   const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
03165   const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
03166   assert(Mask && "Missing call preserved mask for calling convention");
03167   Ops.push_back(DAG.getRegisterMask(Mask));
03168 
03169   if (InFlag.getNode())
03170     Ops.push_back(InFlag);
03171 
03172   if (isTailCall) {
03173     // We used to do:
03174     //// If this is the first return lowered for this function, add the regs
03175     //// to the liveout set for the function.
03176     // This isn't right, although it's probably harmless on x86; liveouts
03177     // should be computed from returns not tail calls.  Consider a void
03178     // function making a tail call to a function returning int.
03179     return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
03180   }
03181 
03182   Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
03183   InFlag = Chain.getValue(1);
03184 
03185   // Create the CALLSEQ_END node.
03186   unsigned NumBytesForCalleeToPop;
03187   if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
03188                        DAG.getTarget().Options.GuaranteedTailCallOpt))
03189     NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
03190   else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
03191            !Subtarget->getTargetTriple().isOSMSVCRT() &&
03192            SR == StackStructReturn)
03193     // If this is a call to a struct-return function, the callee
03194     // pops the hidden struct pointer, so we have to push it back.
03195     // This is common for Darwin/X86, Linux & Mingw32 targets.
03196     // For MSVC Win32 targets, the caller pops the hidden struct pointer.
03197     NumBytesForCalleeToPop = 4;
03198   else
03199     NumBytesForCalleeToPop = 0;  // Callee pops nothing.
03200 
03201   // Returns a flag for retval copy to use.
03202   if (!IsSibcall) {
03203     Chain = DAG.getCALLSEQ_END(Chain,
03204                                DAG.getIntPtrConstant(NumBytesToPop, true),
03205                                DAG.getIntPtrConstant(NumBytesForCalleeToPop,
03206                                                      true),
03207                                InFlag, dl);
03208     InFlag = Chain.getValue(1);
03209   }
03210 
03211   // Handle result values, copying them out of physregs into vregs that we
03212   // return.
03213   return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
03214                          Ins, dl, DAG, InVals);
03215 }
03216 
03217 //===----------------------------------------------------------------------===//
03218 //                Fast Calling Convention (tail call) implementation
03219 //===----------------------------------------------------------------------===//
03220 
03221 //  Like StdCall, the callee cleans up the arguments, except that ECX is
03222 //  reserved for storing the address of the tail-called function. Only 2
03223 //  registers are free for argument passing (inreg). Tail call optimization is
03224 //  performed provided:
03225 //                * tailcallopt is enabled
03226 //                * caller/callee are fastcc
03227 //  On X86_64 architecture with GOT-style position independent code only local
03228 //  (within module) calls are supported at the moment.
03229 //  To keep the stack aligned according to the platform ABI, the function
03230 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
03231 //  multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld
03232 //  for example.) If the tail-called callee has more arguments than the caller,
03233 //  the caller needs to make sure that there is room to move the RETADDR to.
03234 //  This is achieved by reserving an area the size of the argument delta right
03235 //  after the original RETADDR, but before the saved frame pointer or the
03236 //  spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4).
03237 //  stack layout:
03238 //    arg1
03239 //    arg2
03240 //    RETADDR
03241 //    [ new RETADDR
03242 //      move area ]
03243 //    (possible EBP)
03244 //    ESI
03245 //    EDI
03246 //    local1 ..
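//  Worked example (hedged; assumes a 32-bit target, 4-byte arguments and the
//  16n + 12 alignment applied under -tailcallopt): a caller that pushed two
//  arguments has an aligned argument area of 12 bytes; tail calling a callee
//  that needs four arguments requires 28 bytes, so FPDiff = 12 - 28 = -16 and
//  the return address is re-stored 16 bytes lower, inside the reserved move
//  area sketched above.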
03247 
03248 /// GetAlignedArgumentStackSize - Round up the stack size so that it keeps the
03249 /// required alignment, e.g. to 16n + 12 for a 16-byte alignment requirement.
03250 unsigned
03251 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
03252                                                SelectionDAG& DAG) const {
03253   MachineFunction &MF = DAG.getMachineFunction();
03254   const TargetMachine &TM = MF.getTarget();
03255   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03256       TM.getSubtargetImpl()->getRegisterInfo());
03257   const TargetFrameLowering &TFI = *TM.getSubtargetImpl()->getFrameLowering();
03258   unsigned StackAlignment = TFI.getStackAlignment();
03259   uint64_t AlignMask = StackAlignment - 1;
03260   int64_t Offset = StackSize;
03261   unsigned SlotSize = RegInfo->getSlotSize();
03262   if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
03263     // The misaligned part fits below (StackAlignment - SlotSize); pad up to it.
03264     Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
03265   } else {
03266     // Mask out the lower bits, then add StackAlignment plus (StackAlignment - SlotSize).
03267     Offset = ((~AlignMask) & Offset) + StackAlignment +
03268       (StackAlignment-SlotSize);
03269   }
03270   return Offset;
03271 }
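
// Worked example (assuming StackAlignment = 16 and SlotSize = 4, i.e. a
// 32-bit target): a StackSize of 12 is already of the form 16n + 12 and is
// returned unchanged; 20 is padded up to 28 (16*1 + 12); 30 falls past the
// boundary, so it is masked down to 16 and padded up to 44 (16*2 + 12).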
03272 
03273 /// MatchingStackOffset - Return true if the given stack call argument is
03274 /// already available in the same position (relatively) of the caller's
03275 /// incoming argument stack.
03276 static
03277 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
03278                          MachineFrameInfo *MFI, const MachineRegisterInfo *MRI,
03279                          const X86InstrInfo *TII) {
03280   unsigned Bytes = Arg.getValueType().getSizeInBits() / 8;
03281   int FI = INT_MAX;
03282   if (Arg.getOpcode() == ISD::CopyFromReg) {
03283     unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
03284     if (!TargetRegisterInfo::isVirtualRegister(VR))
03285       return false;
03286     MachineInstr *Def = MRI->getVRegDef(VR);
03287     if (!Def)
03288       return false;
03289     if (!Flags.isByVal()) {
03290       if (!TII->isLoadFromStackSlot(Def, FI))
03291         return false;
03292     } else {
03293       unsigned Opcode = Def->getOpcode();
03294       if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) &&
03295           Def->getOperand(1).isFI()) {
03296         FI = Def->getOperand(1).getIndex();
03297         Bytes = Flags.getByValSize();
03298       } else
03299         return false;
03300     }
03301   } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
03302     if (Flags.isByVal())
03303       // ByVal argument is passed in as a pointer but it's now being
03304       // dereferenced. e.g.
03305       // define @foo(%struct.X* %A) {
03306       //   tail call @bar(%struct.X* byval %A)
03307       // }
03308       return false;
03309     SDValue Ptr = Ld->getBasePtr();
03310     FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
03311     if (!FINode)
03312       return false;
03313     FI = FINode->getIndex();
03314   } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
03315     FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
03316     FI = FINode->getIndex();
03317     Bytes = Flags.getByValSize();
03318   } else
03319     return false;
03320 
03321   assert(FI != INT_MAX);
03322   if (!MFI->isFixedObjectIndex(FI))
03323     return false;
03324   return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI);
03325 }
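
// Illustration (hedged sketch, hypothetical functions): in a simple
// forwarding call such as
//
//   int Caller(int X) { return Callee(X); }
//
// the outgoing stack argument X sits at the same fixed offset, and has the
// same size, as the caller's own incoming X, so MatchingStackOffset returns
// true and no copy is needed - one of the conditions that makes the call
// eligible for a sibcall.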
03326 
03327 /// IsEligibleForTailCallOptimization - Check whether the call is eligible
03328 /// for tail call optimization. Targets which want to do tail call
03329 /// optimization should implement this function.
03330 bool
03331 X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
03332                                                      CallingConv::ID CalleeCC,
03333                                                      bool isVarArg,
03334                                                      bool isCalleeStructRet,
03335                                                      bool isCallerStructRet,
03336                                                      Type *RetTy,
03337                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
03338                                     const SmallVectorImpl<SDValue> &OutVals,
03339                                     const SmallVectorImpl<ISD::InputArg> &Ins,
03340                                                      SelectionDAG &DAG) const {
03341   if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC))
03342     return false;
03343 
03344   // If -tailcallopt is specified, make fastcc functions tail-callable.
03345   const MachineFunction &MF = DAG.getMachineFunction();
03346   const Function *CallerF = MF.getFunction();
03347 
03348   // If the function return type is x86_fp80 and the callee return type is not,
03349   // then the FP_EXTEND of the call result is not a nop. It's not safe to
03350   // perform a tailcall optimization here.
03351   if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
03352     return false;
03353 
03354   CallingConv::ID CallerCC = CallerF->getCallingConv();
03355   bool CCMatch = CallerCC == CalleeCC;
03356   bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC);
03357   bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC);
03358 
03359   if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
03360     if (IsTailCallConvention(CalleeCC) && CCMatch)
03361       return true;
03362     return false;
03363   }
03364 
03365   // Look for obvious safe cases to perform tail call optimization that do not
03366   // require ABI changes. This is what gcc calls sibcall.
03367 
03368   // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
03369   // emit a special epilogue.
03370   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03371       DAG.getSubtarget().getRegisterInfo());
03372   if (RegInfo->needsStackRealignment(MF))
03373     return false;
03374 
03375   // Also avoid sibcall optimization if either caller or callee uses struct
03376   // return semantics.
03377   if (isCalleeStructRet || isCallerStructRet)
03378     return false;
03379 
03380   // A stdcall/thiscall caller is expected to clean up its arguments; the
03381   // callee isn't going to do that.
03382   // FIXME: this is more restrictive than needed. We could produce a tailcall
03383   // when the stack adjustment matches. For example, with a thiscall that takes
03384   // only one argument.
03385   if (!CCMatch && (CallerCC == CallingConv::X86_StdCall ||
03386                    CallerCC == CallingConv::X86_ThisCall))
03387     return false;
03388 
03389   // Do not sibcall optimize vararg calls unless all arguments are passed via
03390   // registers.
03391   if (isVarArg && !Outs.empty()) {
03392 
03393     // Optimizing for varargs on Win64 is unlikely to be safe without
03394     // additional testing.
03395     if (IsCalleeWin64 || IsCallerWin64)
03396       return false;
03397 
03398     SmallVector<CCValAssign, 16> ArgLocs;
03399     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03400                    *DAG.getContext());
03401 
03402     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03403     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
03404       if (!ArgLocs[i].isRegLoc())
03405         return false;
03406   }
03407 
03408   // If the call result is in ST0 / ST1, it needs to be popped off the x87
03409   // stack.  Therefore, if it's not used by the call it is not safe to optimize
03410   // this into a sibcall.
03411   bool Unused = false;
03412   for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
03413     if (!Ins[i].Used) {
03414       Unused = true;
03415       break;
03416     }
03417   }
03418   if (Unused) {
03419     SmallVector<CCValAssign, 16> RVLocs;
03420     CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), RVLocs,
03421                    *DAG.getContext());
03422     CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
03423     for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
03424       CCValAssign &VA = RVLocs[i];
03425       if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
03426         return false;
03427     }
03428   }
03429 
03430   // If the calling conventions do not match, then we'd better make sure the
03431   // results are returned in the same way as what the caller expects.
03432   if (!CCMatch) {
03433     SmallVector<CCValAssign, 16> RVLocs1;
03434     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), RVLocs1,
03435                     *DAG.getContext());
03436     CCInfo1.AnalyzeCallResult(Ins, RetCC_X86);
03437 
03438     SmallVector<CCValAssign, 16> RVLocs2;
03439     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), RVLocs2,
03440                     *DAG.getContext());
03441     CCInfo2.AnalyzeCallResult(Ins, RetCC_X86);
03442 
03443     if (RVLocs1.size() != RVLocs2.size())
03444       return false;
03445     for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) {
03446       if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc())
03447         return false;
03448       if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo())
03449         return false;
03450       if (RVLocs1[i].isRegLoc()) {
03451         if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg())
03452           return false;
03453       } else {
03454         if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset())
03455           return false;
03456       }
03457     }
03458   }
03459 
03460   // If the callee takes no arguments then go on to check the results of the
03461   // call.
03462   if (!Outs.empty()) {
03463     // Check if stack adjustment is needed. For now, do not do this if any
03464     // argument is passed on the stack.
03465     SmallVector<CCValAssign, 16> ArgLocs;
03466     CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), ArgLocs,
03467                    *DAG.getContext());
03468 
03469     // Allocate shadow area for Win64
03470     if (IsCalleeWin64)
03471       CCInfo.AllocateStack(32, 8);
03472 
03473     CCInfo.AnalyzeCallOperands(Outs, CC_X86);
03474     if (CCInfo.getNextStackOffset()) {
03475       MachineFunction &MF = DAG.getMachineFunction();
03476       if (MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn())
03477         return false;
03478 
03479       // Check if the arguments are already laid out in the right way as
03480       // the caller's fixed stack objects.
03481       MachineFrameInfo *MFI = MF.getFrameInfo();
03482       const MachineRegisterInfo *MRI = &MF.getRegInfo();
03483       const X86InstrInfo *TII =
03484           static_cast<const X86InstrInfo *>(DAG.getSubtarget().getInstrInfo());
03485       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03486         CCValAssign &VA = ArgLocs[i];
03487         SDValue Arg = OutVals[i];
03488         ISD::ArgFlagsTy Flags = Outs[i].Flags;
03489         if (VA.getLocInfo() == CCValAssign::Indirect)
03490           return false;
03491         if (!VA.isRegLoc()) {
03492           if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
03493                                    MFI, MRI, TII))
03494             return false;
03495         }
03496       }
03497     }
03498 
03499     // If the tailcall address may be in a register, then make sure it's
03500     // possible to register allocate for it. In 32-bit, the call address can
03501     // only target EAX, EDX, or ECX since the tail call must be scheduled after
03502     // callee-saved registers are restored. These happen to be the same
03503     // registers used to pass 'inreg' arguments so watch out for those.
03504     if (!Subtarget->is64Bit() &&
03505         ((!isa<GlobalAddressSDNode>(Callee) &&
03506           !isa<ExternalSymbolSDNode>(Callee)) ||
03507          DAG.getTarget().getRelocationModel() == Reloc::PIC_)) {
03508       unsigned NumInRegs = 0;
03509       // In PIC we need an extra register to formulate the address computation
03510       // for the callee.
03511       unsigned MaxInRegs =
03512           (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
03513 
03514       for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
03515         CCValAssign &VA = ArgLocs[i];
03516         if (!VA.isRegLoc())
03517           continue;
03518         unsigned Reg = VA.getLocReg();
03519         switch (Reg) {
03520         default: break;
03521         case X86::EAX: case X86::EDX: case X86::ECX:
03522           if (++NumInRegs == MaxInRegs)
03523             return false;
03524           break;
03525         }
03526       }
03527     }
03528   }
03529 
03530   return true;
03531 }
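// For example, for the address-register check above: in 32-bit PIC mode an
// extra register is needed for the address computation of the callee, so
// MaxInRegs is 2 and the sibcall is rejected as soon as two arguments are
// assigned to EAX, ECX or EDX.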
03532 
03533 FastISel *
03534 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
03535                                   const TargetLibraryInfo *libInfo) const {
03536   return X86::createFastISel(funcInfo, libInfo);
03537 }
03538 
03539 //===----------------------------------------------------------------------===//
03540 //                           Other Lowering Hooks
03541 //===----------------------------------------------------------------------===//
03542 
03543 static bool MayFoldLoad(SDValue Op) {
03544   return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
03545 }
03546 
03547 static bool MayFoldIntoStore(SDValue Op) {
03548   return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
03549 }
03550 
03551 static bool isTargetShuffle(unsigned Opcode) {
03552   switch(Opcode) {
03553   default: return false;
03554   case X86ISD::BLENDI:
03555   case X86ISD::PSHUFB:
03556   case X86ISD::PSHUFD:
03557   case X86ISD::PSHUFHW:
03558   case X86ISD::PSHUFLW:
03559   case X86ISD::SHUFP:
03560   case X86ISD::PALIGNR:
03561   case X86ISD::MOVLHPS:
03562   case X86ISD::MOVLHPD:
03563   case X86ISD::MOVHLPS:
03564   case X86ISD::MOVLPS:
03565   case X86ISD::MOVLPD:
03566   case X86ISD::MOVSHDUP:
03567   case X86ISD::MOVSLDUP:
03568   case X86ISD::MOVDDUP:
03569   case X86ISD::MOVSS:
03570   case X86ISD::MOVSD:
03571   case X86ISD::UNPCKL:
03572   case X86ISD::UNPCKH:
03573   case X86ISD::VPERMILPI:
03574   case X86ISD::VPERM2X128:
03575   case X86ISD::VPERMI:
03576     return true;
03577   }
03578 }
03579 
03580 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03581                                     SDValue V1, SelectionDAG &DAG) {
03582   switch(Opc) {
03583   default: llvm_unreachable("Unknown x86 shuffle node");
03584   case X86ISD::MOVSHDUP:
03585   case X86ISD::MOVSLDUP:
03586   case X86ISD::MOVDDUP:
03587     return DAG.getNode(Opc, dl, VT, V1);
03588   }
03589 }
03590 
03591 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03592                                     SDValue V1, unsigned TargetMask,
03593                                     SelectionDAG &DAG) {
03594   switch(Opc) {
03595   default: llvm_unreachable("Unknown x86 shuffle node");
03596   case X86ISD::PSHUFD:
03597   case X86ISD::PSHUFHW:
03598   case X86ISD::PSHUFLW:
03599   case X86ISD::VPERMILPI:
03600   case X86ISD::VPERMI:
03601     return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8));
03602   }
03603 }
03604 
03605 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03606                                     SDValue V1, SDValue V2, unsigned TargetMask,
03607                                     SelectionDAG &DAG) {
03608   switch(Opc) {
03609   default: llvm_unreachable("Unknown x86 shuffle node");
03610   case X86ISD::PALIGNR:
03611   case X86ISD::VALIGN:
03612   case X86ISD::SHUFP:
03613   case X86ISD::VPERM2X128:
03614     return DAG.getNode(Opc, dl, VT, V1, V2,
03615                        DAG.getConstant(TargetMask, MVT::i8));
03616   }
03617 }
03618 
03619 static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT,
03620                                     SDValue V1, SDValue V2, SelectionDAG &DAG) {
03621   switch(Opc) {
03622   default: llvm_unreachable("Unknown x86 shuffle node");
03623   case X86ISD::MOVLHPS:
03624   case X86ISD::MOVLHPD:
03625   case X86ISD::MOVHLPS:
03626   case X86ISD::MOVLPS:
03627   case X86ISD::MOVLPD:
03628   case X86ISD::MOVSS:
03629   case X86ISD::MOVSD:
03630   case X86ISD::UNPCKL:
03631   case X86ISD::UNPCKH:
03632     return DAG.getNode(Opc, dl, VT, V1, V2);
03633   }
03634 }
03635 
03636 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
03637   MachineFunction &MF = DAG.getMachineFunction();
03638   const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
03639       DAG.getSubtarget().getRegisterInfo());
03640   X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
03641   int ReturnAddrIndex = FuncInfo->getRAIndex();
03642 
03643   if (ReturnAddrIndex == 0) {
03644     // Set up a frame object for the return address.
03645     unsigned SlotSize = RegInfo->getSlotSize();
03646     ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize,
03647                                                            -(int64_t)SlotSize,
03648                                                            false);
03649     FuncInfo->setRAIndex(ReturnAddrIndex);
03650   }
03651 
03652   return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
03653 }
03654 
03655 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
03656                                        bool hasSymbolicDisplacement) {
03657   // Offset should fit into 32 bit immediate field.
03658   if (!isInt<32>(Offset))
03659     return false;
03660 
03661   // If we don't have a symbolic displacement - we don't have any extra
03662   // restrictions.
03663   if (!hasSymbolicDisplacement)
03664     return true;
03665 
03666   // FIXME: Some tweaks might be needed for medium code model.
03667   if (M != CodeModel::Small && M != CodeModel::Kernel)
03668     return false;
03669 
03670   // For the small code model we assume that the last object is 16MB below the
03671   // end of the 31-bit boundary. We may also accept fairly large negative
03672   // constants, since all objects lie in the positive half of the address space.
03673   if (M == CodeModel::Small && Offset < 16*1024*1024)
03674     return true;
03675 
03676   // For the kernel code model we know that all objects reside in the negative
03677   // half of the 32-bit address space. We do not accept negative offsets, since
03678   // they may fall out of that range, but we do accept fairly large positive ones.
03679   if (M == CodeModel::Kernel && Offset > 0)
03680     return true;
03681 
03682   return false;
03683 }
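// A few illustrative cases, assuming a symbolic displacement is present:
//   isOffsetSuitableForCodeModel(1 << 20,  CodeModel::Small,  true) -> true
//   isOffsetSuitableForCodeModel(32 << 20, CodeModel::Small,  true) -> false
//   isOffsetSuitableForCodeModel(-8,       CodeModel::Kernel, true) -> false
// Without a symbolic displacement any offset that fits in 32 bits is accepted.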
03684 
03685 /// isCalleePop - Determines whether the callee is required to pop its
03686 /// own arguments. Callee pop is necessary to support tail calls.
03687 bool X86::isCalleePop(CallingConv::ID CallingConv,
03688                       bool is64Bit, bool IsVarArg, bool TailCallOpt) {
03689   switch (CallingConv) {
03690   default:
03691     return false;
03692   case CallingConv::X86_StdCall:
03693   case CallingConv::X86_FastCall:
03694   case CallingConv::X86_ThisCall:
03695     return !is64Bit;
03696   case CallingConv::Fast:
03697   case CallingConv::GHC:
03698   case CallingConv::HiPE:
03699     if (IsVarArg)
03700       return false;
03701     return TailCallOpt;
03702   }
03703 }
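// For example: X86_StdCall, X86_FastCall and X86_ThisCall pop their own
// arguments on 32-bit targets but not on 64-bit ones; CallingConv::Fast (and
// GHC/HiPE) pop only when TailCallOpt is set and the call is not variadic;
// the default C convention never pops.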
03704 
03705 /// \brief Return true if the condition is an unsigned comparison operation.
03706 static bool isX86CCUnsigned(unsigned X86CC) {
03707   switch (X86CC) {
03708   default: llvm_unreachable("Invalid integer condition!");
03709   case X86::COND_E:     return true;
03710   case X86::COND_G:     return false;
03711   case X86::COND_GE:    return false;
03712   case X86::COND_L:     return false;
03713   case X86::COND_LE:    return false;
03714   case X86::COND_NE:    return true;
03715   case X86::COND_B:     return true;
03716   case X86::COND_A:     return true;
03717   case X86::COND_BE:    return true;
03718   case X86::COND_AE:    return true;
03719   }
03720   llvm_unreachable("covered switch fell through?!");
03721 }
03722 
03723 /// TranslateX86CC - do a one-to-one translation of an ISD::CondCode to the
03724 /// X86-specific condition code, returning the condition code and the LHS/RHS of
03725 /// the comparison to make.
03726 static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP,
03727                                SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) {
03728   if (!isFP) {
03729     if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
03730       if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
03731         // X > -1   -> X == 0, jump !sign.
03732         RHS = DAG.getConstant(0, RHS.getValueType());
03733         return X86::COND_NS;
03734       }
03735       if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
03736         // X < 0   -> X == 0, jump on sign.
03737         return X86::COND_S;
03738       }
03739       if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
03740         // X < 1   -> X <= 0
03741         RHS = DAG.getConstant(0, RHS.getValueType());
03742         return X86::COND_LE;
03743       }
03744     }
03745 
03746     switch (SetCCOpcode) {
03747     default: llvm_unreachable("Invalid integer condition!");
03748     case ISD::SETEQ:  return X86::COND_E;
03749     case ISD::SETGT:  return X86::COND_G;
03750     case ISD::SETGE:  return X86::COND_GE;
03751     case ISD::SETLT:  return X86::COND_L;
03752     case ISD::SETLE:  return X86::COND_LE;
03753     case ISD::SETNE:  return X86::COND_NE;
03754     case ISD::SETULT: return X86::COND_B;
03755     case ISD::SETUGT: return X86::COND_A;
03756     case ISD::SETULE: return X86::COND_BE;
03757     case ISD::SETUGE: return X86::COND_AE;
03758     }
03759   }
03760 
03761   // First determine if it is required or is profitable to flip the operands.
03762 
03763   // If LHS is a foldable load, but RHS is not, flip the condition.
03764   if (ISD::isNON_EXTLoad(LHS.getNode()) &&
03765       !ISD::isNON_EXTLoad(RHS.getNode())) {
03766     SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
03767     std::swap(LHS, RHS);
03768   }
03769 
03770   switch (SetCCOpcode) {
03771   default: break;
03772   case ISD::SETOLT:
03773   case ISD::SETOLE:
03774   case ISD::SETUGT:
03775   case ISD::SETUGE:
03776     std::swap(LHS, RHS);
03777     break;
03778   }
03779 
03780   // On a floating point condition, the flags are set as follows:
03781   // ZF  PF  CF   op
03782   //  0 | 0 | 0 | X > Y
03783   //  0 | 0 | 1 | X < Y
03784   //  1 | 0 | 0 | X == Y
03785   //  1 | 1 | 1 | unordered
03786   switch (SetCCOpcode) {
03787   default: llvm_unreachable("Condcode should be pre-legalized away");
03788   case ISD::SETUEQ:
03789   case ISD::SETEQ:   return X86::COND_E;
03790   case ISD::SETOLT:              // flipped
03791   case ISD::SETOGT:
03792   case ISD::SETGT:   return X86::COND_A;
03793   case ISD::SETOLE:              // flipped
03794   case ISD::SETOGE:
03795   case ISD::SETGE:   return X86::COND_AE;
03796   case ISD::SETUGT:              // flipped
03797   case ISD::SETULT:
03798   case ISD::SETLT:   return X86::COND_B;
03799   case ISD::SETUGE:              // flipped
03800   case ISD::SETULE:
03801   case ISD::SETLE:   return X86::COND_BE;
03802   case ISD::SETONE:
03803   case ISD::SETNE:   return X86::COND_NE;
03804   case ISD::SETUO:   return X86::COND_P;
03805   case ISD::SETO:    return X86::COND_NP;
03806   case ISD::SETOEQ:
03807   case ISD::SETUNE:  return X86::COND_INVALID;
03808   }
03809 }
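// A few worked examples for the translation above:
//   integer:  X setlt  1  ->  RHS is rewritten to 0 and COND_LE is returned,
//                             so "X < 1" is tested as "X <= 0"
//   integer:  X setult Y  ->  COND_B (unsigned below)
//   FP:       X setolt Y  ->  the operands are swapped and COND_A is
//                             returned, i.e. it is tested as "Y > X"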
03810 
03811 /// hasFPCMov - is there a floating point cmov for the specified X86 condition
03812 /// code? The current x86 ISA includes the following FP cmov instructions:
03813 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
03814 static bool hasFPCMov(unsigned X86CC) {
03815   switch (X86CC) {
03816   default:
03817     return false;
03818   case X86::COND_B:
03819   case X86::COND_BE:
03820   case X86::COND_E:
03821   case X86::COND_P:
03822   case X86::COND_A:
03823   case X86::COND_AE:
03824   case X86::COND_NE:
03825   case X86::COND_NP:
03826     return true;
03827   }
03828 }
03829 
03830 /// isFPImmLegal - Returns true if the target can instruction select the
03831 /// specified FP immediate natively. If false, the legalizer will
03832 /// materialize the FP immediate as a load from a constant pool.
03833 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
03834   for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
03835     if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
03836       return true;
03837   }
03838   return false;
03839 }
03840 
03841 /// \brief Returns true if it is beneficial to convert a load of a constant
03842 /// to just the constant itself.
03843 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
03844                                                           Type *Ty) const {
03845   assert(Ty->isIntegerTy());
03846 
03847   unsigned BitSize = Ty->getPrimitiveSizeInBits();
03848   if (BitSize == 0 || BitSize > 64)
03849     return false;
03850   return true;
03851 }
03852 
03853 /// isUndefOrInRange - Return true if Val is undef or if its value falls within
03854 /// the specified half-open range [Low, Hi).
03855 static bool isUndefOrInRange(int Val, int Low, int Hi) {
03856   return (Val < 0) || (Val >= Low && Val < Hi);
03857 }
03858 
03859 /// isUndefOrEqual - Val is either less than zero (undef) or equal to the
03860 /// specified value.
03861 static bool isUndefOrEqual(int Val, int CmpVal) {
03862   return (Val < 0 || Val == CmpVal);
03863 }
03864 
03865 /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning
03866 /// at position Pos and ending at Pos+Size, is either undef or equal to the
03867 /// sequential values Low, Low+1, ..., Low+Size-1.
03868 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
03869                                        unsigned Pos, unsigned Size, int Low) {
03870   for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
03871     if (!isUndefOrEqual(Mask[i], Low))
03872       return false;
03873   return true;
03874 }
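// For example, isSequentialOrUndefInRange({4, 5, -1, 7}, 0, 4, 4) returns
// true: the four elements are either undef or equal to 4, 5, 6, 7 in turn.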
03875 
03876 /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that
03877 /// is suitable for input to PSHUFD. That is, it doesn't reference the other
03878 /// operand; by default it matches against the first operand.
03879 static bool isPSHUFDMask(ArrayRef<int> Mask, MVT VT,
03880                          bool TestSecondOperand = false) {
03881   if (VT != MVT::v4f32 && VT != MVT::v4i32 &&
03882       VT != MVT::v2f64 && VT != MVT::v2i64)
03883     return false;
03884 
03885   unsigned NumElems = VT.getVectorNumElements();
03886   unsigned Lo = TestSecondOperand ? NumElems : 0;
03887   unsigned Hi = Lo + NumElems;
03888 
03889   for (unsigned i = 0; i < NumElems; ++i)
03890     if (!isUndefOrInRange(Mask[i], (int)Lo, (int)Hi))
03891       return false;
03892 
03893   return true;
03894 }
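// For example, on v4i32 the mask <2, 3, 0, 1> is a valid PSHUFD mask (every
// index lies in [0, 4)), while <0, 4, 1, 5> is not, because element 4 refers
// to the second operand.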
03895 
03896 /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that
03897 /// is suitable for input to PSHUFHW.
03898 static bool isPSHUFHWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03899   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03900     return false;
03901 
03902   // Lower quadword copied in order or undef.
03903   if (!isSequentialOrUndefInRange(Mask, 0, 4, 0))
03904     return false;
03905 
03906   // Upper quadword shuffled.
03907   for (unsigned i = 4; i != 8; ++i)
03908     if (!isUndefOrInRange(Mask[i], 4, 8))
03909       return false;
03910 
03911   if (VT == MVT::v16i16) {
03912     // Lower quadword copied in order or undef.
03913     if (!isSequentialOrUndefInRange(Mask, 8, 4, 8))
03914       return false;
03915 
03916     // Upper quadword shuffled.
03917     for (unsigned i = 12; i != 16; ++i)
03918       if (!isUndefOrInRange(Mask[i], 12, 16))
03919         return false;
03920   }
03921 
03922   return true;
03923 }
03924 
03925 /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that
03926 /// is suitable for input to PSHUFLW.
03927 static bool isPSHUFLWMask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
03928   if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16))
03929     return false;
03930 
03931   // Upper quadword copied in order.
03932   if (!isSequentialOrUndefInRange(Mask, 4, 4, 4))
03933     return false;
03934 
03935   // Lower quadword shuffled.
03936   for (unsigned i = 0; i != 4; ++i)
03937     if (!isUndefOrInRange(Mask[i], 0, 4))
03938       return false;
03939 
03940   if (VT == MVT::v16i16) {
03941     // Upper quadword copied in order.
03942     if (!isSequentialOrUndefInRange(Mask, 12, 4, 12))
03943       return false;
03944 
03945     // Lower quadword shuffled.
03946     for (unsigned i = 8; i != 12; ++i)
03947       if (!isUndefOrInRange(Mask[i], 8, 12))
03948         return false;
03949   }
03950 
03951   return true;
03952 }
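// For example, on v8i16 the mask <0, 1, 2, 3, 7, 6, 5, 4> matches PSHUFHW
// (lower quadword in order, upper quadword permuted within 4..7), and the
// mask <3, 2, 1, 0, 4, 5, 6, 7> matches PSHUFLW (upper quadword in order,
// lower quadword permuted within 0..3).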
03953 
03954 /// \brief Return true if the mask specifies a shuffle of elements that is
03955 /// suitable for input to intralane (palignr) or interlane (valign) vector
03956 /// right-shift.
03957 static bool isAlignrMask(ArrayRef<int> Mask, MVT VT, bool InterLane) {
03958   unsigned NumElts = VT.getVectorNumElements();
03959   unsigned NumLanes = InterLane ? 1: VT.getSizeInBits()/128;
03960   unsigned NumLaneElts = NumElts/NumLanes;
03961 
03962   // Do not handle 64-bit element shuffles with palignr.
03963   if (NumLaneElts == 2)
03964     return false;
03965 
03966   for (unsigned l = 0; l != NumElts; l+=NumLaneElts) {
03967     unsigned i;
03968     for (i = 0; i != NumLaneElts; ++i) {
03969       if (Mask[i+l] >= 0)
03970         break;
03971     }
03972 
03973     // Lane is all undef, go to next lane
03974     if (i == NumLaneElts)
03975       continue;
03976 
03977     int Start = Mask[i+l];
03978 
03979     // Make sure it's in this lane in one of the sources
03980     if (!isUndefOrInRange(Start, l, l+NumLaneElts) &&
03981         !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts))
03982       return false;
03983 
03984     // If not lane 0, then we must match lane 0
03985     if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l))
03986       return false;
03987 
03988     // Correct second source to be contiguous with first source
03989     if (Start >= (int)NumElts)
03990       Start -= NumElts - NumLaneElts;
03991 
03992     // Make sure we're shifting in the right direction.
03993     if (Start <= (int)(i+l))
03994       return false;
03995 
03996     Start -= i;
03997 
03998     // Check the rest of the elements to see if they are consecutive.
03999     for (++i; i != NumLaneElts; ++i) {
04000       int Idx = Mask[i+l];
04001 
04002       // Make sure it's in this lane
04003       if (!isUndefOrInRange(Idx, l, l+NumLaneElts) &&
04004           !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts))
04005         return false;
04006 
04007       // If not lane 0, then we must match lane 0
04008       if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l))
04009         return false;
04010 
04011       if (Idx >= (int)NumElts)
04012         Idx -= NumElts - NumLaneElts;
04013 
04014       if (!isUndefOrEqual(Idx, Start+i))
04015         return false;
04016 
04017     }
04018   }
04019 
04020   return true;
04021 }
04022 
04023 /// \brief Return true if the node specifies a shuffle of elements that is
04024 /// suitable for input to PALIGNR.
04025 static bool isPALIGNRMask(ArrayRef<int> Mask, MVT VT,
04026                           const X86Subtarget *Subtarget) {
04027   if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
04028       (VT.is256BitVector() && !Subtarget->hasInt256()) ||
04029       VT.is512BitVector())
04030     // FIXME: Add AVX512BW.
04031     return false;
04032 
04033   return isAlignrMask(Mask, VT, false);
04034 }
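// For example, on v8i16 the mask <1, 2, 3, 4, 5, 6, 7, 8> is a valid PALIGNR
// mask: it is a right shift of the two concatenated sources by one element,
// with element 8 (the low element of the second source) shifted in at the top.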
04035 
04036 /// \brief Return true if the node specifies a shuffle of elements that is
04037 /// suitable for input to VALIGN.
04038 static bool isVALIGNMask(ArrayRef<int> Mask, MVT VT,
04039                           const X86Subtarget *Subtarget) {
04040   // FIXME: Add AVX512VL.
04041   if (!VT.is512BitVector() || !Subtarget->hasAVX512())
04042     return false;
04043   return isAlignrMask(Mask, VT, true);
04044 }
04045 
04046 /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
04047 /// the two vector operands have swapped position.
04048 static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
04049                                      unsigned NumElems) {
04050   for (unsigned i = 0; i != NumElems; ++i) {
04051     int idx = Mask[i];
04052     if (idx < 0)
04053       continue;
04054     else if (idx < (int)NumElems)
04055       Mask[i] = idx + NumElems;
04056     else
04057       Mask[i] = idx - NumElems;
04058   }
04059 }
04060 
04061 /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand
04062 /// specifies a shuffle of elements that is suitable for input to 128/256-bit
04063 /// SHUFPS and SHUFPD. If Commuted is true, it checks whether the sources are in
04064 /// the reverse order of what x86 shuffles want.
04065 static bool isSHUFPMask(ArrayRef<int> Mask, MVT VT, bool Commuted = false) {
04066 
04067   unsigned NumElems = VT.getVectorNumElements();
04068   unsigned NumLanes = VT.getSizeInBits()/128;
04069   unsigned NumLaneElems = NumElems/NumLanes;
04070 
04071   if (NumLaneElems != 2 && NumLaneElems != 4)
04072     return false;
04073 
04074   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04075   bool symetricMaskRequired =
04076     (VT.getSizeInBits() >= 256) && (EltSize == 32);
04077 
04078   // VSHUFPSY divides the resulting vector into 4 chunks.
04079   // The sources are also split into 4 chunks, and each destination
04080   // chunk must come from a different source chunk.
04081   //
04082   //  SRC1 =>   X7    X6    X5    X4    X3    X2    X1    X0
04083   //  SRC2 =>   Y7    Y6    Y5    Y4    Y3    Y2    Y1    Y0
04084   //
04085   //  DST  =>  Y7..Y4,   Y7..Y4,   X7..X4,   X7..X4,
04086   //           Y3..Y0,   Y3..Y0,   X3..X0,   X3..X0
04087   //
04088   // VSHUFPDY divides the resulting vector into 4 chunks.
04089   // The sources are also split into 4 chunks, and each destination
04090   // chunk must come from a different source chunk.
04091   //
04092   //  SRC1 =>      X3       X2       X1       X0
04093   //  SRC2 =>      Y3       Y2       Y1       Y0
04094   //
04095   //  DST  =>  Y3..Y2,  X3..X2,  Y1..Y0,  X1..X0
04096   //
04097   SmallVector<int, 4> MaskVal(NumLaneElems, -1);
04098   unsigned HalfLaneElems = NumLaneElems/2;
04099   for (unsigned l = 0; l != NumElems; l += NumLaneElems) {
04100     for (unsigned i = 0; i != NumLaneElems; ++i) {
04101       int Idx = Mask[i+l];
04102       unsigned RngStart = l + ((Commuted == (i<HalfLaneElems)) ? NumElems : 0);
04103       if (!isUndefOrInRange(Idx, RngStart, RngStart+NumLaneElems))
04104         return false;
04105       // For VSHUFPSY, the mask of the second half must be the same as the
04106       // first but with the appropriate offsets. This works in the same way as
04107       // VPERMILPS works with masks.
04108       if (!symetricMaskRequired || Idx < 0)
04109         continue;
04110       if (MaskVal[i] < 0) {
04111         MaskVal[i] = Idx - l;
04112         continue;
04113       }
04114       if ((signed)(Idx - l) != MaskVal[i])
04115         return false;
04116     }
04117   }
04118 
04119   return true;
04120 }
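// For example, on v4f32 the mask <1, 0, 5, 4> matches SHUFPS (non-commuted):
// the low two elements select from the first source and the high two from the
// second.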
04121 
04122 /// isMOVHLPSMask - Return true if the specified VECTOR_SHUFFLE operand
04123 /// specifies a shuffle of elements that is suitable for input to MOVHLPS.
04124 static bool isMOVHLPSMask(ArrayRef<int> Mask, MVT VT) {
04125   if (!VT.is128BitVector())
04126     return false;
04127 
04128   unsigned NumElems = VT.getVectorNumElements();
04129 
04130   if (NumElems != 4)
04131     return false;
04132 
04133   // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3
04134   return isUndefOrEqual(Mask[0], 6) &&
04135          isUndefOrEqual(Mask[1], 7) &&
04136          isUndefOrEqual(Mask[2], 2) &&
04137          isUndefOrEqual(Mask[3], 3);
04138 }
04139 
04140 /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form
04141 /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef,
04142 /// <2, 3, 2, 3>
04143 static bool isMOVHLPS_v_undef_Mask(ArrayRef<int> Mask, MVT VT) {
04144   if (!VT.is128BitVector())
04145     return false;
04146 
04147   unsigned NumElems = VT.getVectorNumElements();
04148 
04149   if (NumElems != 4)
04150     return false;
04151 
04152   return isUndefOrEqual(Mask[0], 2) &&
04153          isUndefOrEqual(Mask[1], 3) &&
04154          isUndefOrEqual(Mask[2], 2) &&
04155          isUndefOrEqual(Mask[3], 3);
04156 }
04157 
04158 /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand
04159 /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}.
04160 static bool isMOVLPMask(ArrayRef<int> Mask, MVT VT) {
04161   if (!VT.is128BitVector())
04162     return false;
04163 
04164   unsigned NumElems = VT.getVectorNumElements();
04165 
04166   if (NumElems != 2 && NumElems != 4)
04167     return false;
04168 
04169   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04170     if (!isUndefOrEqual(Mask[i], i + NumElems))
04171       return false;
04172 
04173   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
04174     if (!isUndefOrEqual(Mask[i], i))
04175       return false;
04176 
04177   return true;
04178 }
04179 
04180 /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand
04181 /// specifies a shuffle of elements that is suitable for input to MOVLHPS.
04182 static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
04183   if (!VT.is128BitVector())
04184     return false;
04185 
04186   unsigned NumElems = VT.getVectorNumElements();
04187 
04188   if (NumElems != 2 && NumElems != 4)
04189     return false;
04190 
04191   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04192     if (!isUndefOrEqual(Mask[i], i))
04193       return false;
04194 
04195   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
04196     if (!isUndefOrEqual(Mask[i + e], i + NumElems))
04197       return false;
04198 
04199   return true;
04200 }
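// For example, on v4f32 the canonical MOVLHPS mask is <0, 1, 4, 5> (low halves
// of both sources), and <4, 5, 2, 3> matches the MOVLP{S|D} pattern above (low
// half replaced from the second source, high half kept from the first).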
04201 
04202 /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
04203 /// specifies a shuffle of elements that is suitable for input to INSERTPS.
04204 /// i.e. all but one element comes from the same vector.
04205 static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
04206   // TODO: Deal with AVX's VINSERTPS
04207   if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
04208     return false;
04209 
04210   unsigned CorrectPosV1 = 0;
04211   unsigned CorrectPosV2 = 0;
04212   for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) {
04213     if (Mask[i] == -1) {
04214       ++CorrectPosV1;
04215       ++CorrectPosV2;
04216       continue;
04217     }
04218 
04219     if (Mask[i] == i)
04220       ++CorrectPosV1;
04221     else if (Mask[i] == i + 4)
04222       ++CorrectPosV2;
04223   }
04224 
04225   if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
04226     // We have 3 elements (undefs count as elements from any vector) from one
04227     // vector, and one from another.
04228     return true;
04229 
04230   return false;
04231 }
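// For example, on v4f32 the mask <0, 1, 2, 5> qualifies (three elements in
// place from the first vector, one taken from the second), while <0, 5, 2, 7>
// does not (two elements would have to come from each vector).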
04232 
04233 //
04234 // Some special combinations that can be optimized.
04235 //
04236 static
04237 SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
04238                                SelectionDAG &DAG) {
04239   MVT VT = SVOp->getSimpleValueType(0);
04240   SDLoc dl(SVOp);
04241 
04242   if (VT != MVT::v8i32 && VT != MVT::v8f32)
04243     return SDValue();
04244 
04245   ArrayRef<int> Mask = SVOp->getMask();
04246 
04247   // These are the special masks that may be optimized.
04248   static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14};
04249   static const int MaskToOptimizeOdd[]  = {1, 9, 3, 11, 5, 13, 7, 15};
04250   bool MatchEvenMask = true;
04251   bool MatchOddMask  = true;
04252   for (int i=0; i<8; ++i) {
04253     if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i]))
04254       MatchEvenMask = false;
04255     if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i]))
04256       MatchOddMask = false;
04257   }
04258 
04259   if (!MatchEvenMask && !MatchOddMask)
04260     return SDValue();
04261 
04262   SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT);
04263 
04264   SDValue Op0 = SVOp->getOperand(0);
04265   SDValue Op1 = SVOp->getOperand(1);
04266 
04267   if (MatchEvenMask) {
04268     // Shift the second operand right to 32 bits.
04269     static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 };
04270     Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask);
04271   } else {
04272     // Shift the first operand left to 32 bits.
04273     static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 };
04274     Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask);
04275   }
04276   static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15};
04277   return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask);
04278 }
04279 
04280 /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand
04281 /// specifies a shuffle of elements that is suitable for input to UNPCKL.
04282 static bool isUNPCKLMask(ArrayRef<int> Mask, MVT VT,
04283                          bool HasInt256, bool V2IsSplat = false) {
04284 
04285   assert(VT.getSizeInBits() >= 128 &&
04286          "Unsupported vector type for unpckl");
04287 
04288   unsigned NumElts = VT.getVectorNumElements();
04289   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04290       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04291     return false;
04292 
04293   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
04294          "Unsupported vector type for unpckh");
04295 
04296   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04297   unsigned NumLanes = VT.getSizeInBits()/128;
04298   unsigned NumLaneElts = NumElts/NumLanes;
04299 
04300   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04301     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04302       int BitI  = Mask[l+i];
04303       int BitI1 = Mask[l+i+1];
04304       if (!isUndefOrEqual(BitI, j))
04305         return false;
04306       if (V2IsSplat) {
04307         if (!isUndefOrEqual(BitI1, NumElts))
04308           return false;
04309       } else {
04310         if (!isUndefOrEqual(BitI1, j + NumElts))
04311           return false;
04312       }
04313     }
04314   }
04315 
04316   return true;
04317 }
04318 
04319 /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand
04320 /// specifies a shuffle of elements that is suitable for input to UNPCKH.
04321 static bool isUNPCKHMask(ArrayRef<int> Mask, MVT VT,
04322                          bool HasInt256, bool V2IsSplat = false) {
04323   assert(VT.getSizeInBits() >= 128 &&
04324          "Unsupported vector type for unpckh");
04325 
04326   unsigned NumElts = VT.getVectorNumElements();
04327   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04328       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04329     return false;
04330 
04331   assert((!VT.is512BitVector() || VT.getScalarType().getSizeInBits() >= 32) &&
04332          "Unsupported vector type for unpckh");
04333 
04334   // AVX defines UNPCK* to operate independently on 128-bit lanes.
04335   unsigned NumLanes = VT.getSizeInBits()/128;
04336   unsigned NumLaneElts = NumElts/NumLanes;
04337 
04338   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04339     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04340       int BitI  = Mask[l+i];
04341       int BitI1 = Mask[l+i+1];
04342       if (!isUndefOrEqual(BitI, j))
04343         return false;
04344       if (V2IsSplat) {
04345         if (isUndefOrEqual(BitI1, NumElts))
04346           return false;
04347       } else {
04348         if (!isUndefOrEqual(BitI1, j+NumElts))
04349           return false;
04350       }
04351     }
04352   }
04353   return true;
04354 }
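// For example, on v4i32 the UNPCKL mask is <0, 4, 1, 5> and the UNPCKH mask is
// <2, 6, 3, 7>. On v8i32 (AVX) the same patterns repeat per 128-bit lane, e.g.
// UNPCKL is <0, 8, 1, 9, 4, 12, 5, 13>.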
04355 
04356 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
04357 /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
04358 /// <0, 0, 1, 1>
04359 static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04360   unsigned NumElts = VT.getVectorNumElements();
04361   bool Is256BitVec = VT.is256BitVector();
04362 
04363   if (VT.is512BitVector())
04364     return false;
04365   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04366          "Unsupported vector type for unpckh");
04367 
04368   if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
04369       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04370     return false;
04371 
04372   // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern
04373   // FIXME: Need a better way to get rid of this, there's no latency difference
04374 // between UNPCKLPD and MOVDDUP, the latter should always be checked first and
04375   // the former later. We should also remove the "_undef" special mask.
04376   if (NumElts == 4 && Is256BitVec)
04377     return false;
04378 
04379   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04380   // independently on 128-bit lanes.
04381   unsigned NumLanes = VT.getSizeInBits()/128;
04382   unsigned NumLaneElts = NumElts/NumLanes;
04383 
04384   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04385     for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) {
04386       int BitI  = Mask[l+i];
04387       int BitI1 = Mask[l+i+1];
04388 
04389       if (!isUndefOrEqual(BitI, j))
04390         return false;
04391       if (!isUndefOrEqual(BitI1, j))
04392         return false;
04393     }
04394   }
04395 
04396   return true;
04397 }
04398 
04399 /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form
04400 /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef,
04401 /// <2, 2, 3, 3>
04402 static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
04403   unsigned NumElts = VT.getVectorNumElements();
04404 
04405   if (VT.is512BitVector())
04406     return false;
04407 
04408   assert((VT.is128BitVector() || VT.is256BitVector()) &&
04409          "Unsupported vector type for unpckh");
04410 
04411   if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
04412       (!HasInt256 || (NumElts != 16 && NumElts != 32)))
04413     return false;
04414 
04415   // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
04416   // independently on 128-bit lanes.
04417   unsigned NumLanes = VT.getSizeInBits()/128;
04418   unsigned NumLaneElts = NumElts/NumLanes;
04419 
04420   for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
04421     for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) {
04422       int BitI  = Mask[l+i];
04423       int BitI1 = Mask[l+i+1];
04424       if (!isUndefOrEqual(BitI, j))
04425         return false;
04426       if (!isUndefOrEqual(BitI1, j))
04427         return false;
04428     }
04429   }
04430   return true;
04431 }
04432 
04433 // Match for INSERTI64x4/INSERTF64x4 instructions that produce (src0[0], src1[0])
04434 // or (src1[0], src0[1]), i.e. manipulation of 256-bit sub-vectors.
04435 static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
04436   if (!VT.is512BitVector())
04437     return false;
04438 
04439   unsigned NumElts = VT.getVectorNumElements();
04440   unsigned HalfSize = NumElts/2;
04441   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
04442     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
04443       *Imm = 1;
04444       return true;
04445     }
04446   }
04447   if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
04448     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
04449       *Imm = 0;
04450       return true;
04451     }
04452   }
04453   return false;
04454 }
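// For example, on v8i64 the mask <0, 1, 2, 3, 8, 9, 10, 11> matches with
// Imm == 1 (src1's low half becomes the upper half of the result), and
// <8, 9, 10, 11, 4, 5, 6, 7> matches with Imm == 0.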
04455 
04456 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
04457 /// specifies a shuffle of elements that is suitable for input to MOVSS,
04458 /// MOVSD, and MOVD, i.e. setting the lowest element.
04459 static bool isMOVLMask(ArrayRef<int> Mask, EVT VT) {
04460   if (VT.getVectorElementType().getSizeInBits() < 32)
04461     return false;
04462   if (!VT.is128BitVector())
04463     return false;
04464 
04465   unsigned NumElts = VT.getVectorNumElements();
04466 
04467   if (!isUndefOrEqual(Mask[0], NumElts))
04468     return false;
04469 
04470   for (unsigned i = 1; i != NumElts; ++i)
04471     if (!isUndefOrEqual(Mask[i], i))
04472       return false;
04473 
04474   return true;
04475 }
04476 
04477 /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered
04478 /// as permutations between 128-bit chunks or halves. As an example, the
04479 /// shuffle below:
04480 ///   vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15>
04481 /// takes its first half from the second half of V1 and its second half from
04482 /// the second half of V2.
04483 static bool isVPERM2X128Mask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04484   if (!HasFp256 || !VT.is256BitVector())
04485     return false;
04486 
04487   // The shuffle result is divided into half A and half B. In total the two
04488   // sources have 4 halves, namely: C, D, E, F. The final values of A and
04489   // B must come from C, D, E or F.
04490   unsigned HalfSize = VT.getVectorNumElements()/2;
04491   bool MatchA = false, MatchB = false;
04492 
04493   // Check if A comes from one of C, D, E, F.
04494   for (unsigned Half = 0; Half != 4; ++Half) {
04495     if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) {
04496       MatchA = true;
04497       break;
04498     }
04499   }
04500 
04501   // Check if B comes from one of C, D, E, F.
04502   for (unsigned Half = 0; Half != 4; ++Half) {
04503     if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) {
04504       MatchB = true;
04505       break;
04506     }
04507   }
04508 
04509   return MatchA && MatchB;
04510 }
04511 
04512 /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
04513 /// the specified VECTOR_SHUFFLE mask with VPERM2F128/VPERM2I128 instructions.
04514 static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
04515   MVT VT = SVOp->getSimpleValueType(0);
04516 
04517   unsigned HalfSize = VT.getVectorNumElements()/2;
04518 
04519   unsigned FstHalf = 0, SndHalf = 0;
04520   for (unsigned i = 0; i < HalfSize; ++i) {
04521     if (SVOp->getMaskElt(i) > 0) {
04522       FstHalf = SVOp->getMaskElt(i)/HalfSize;
04523       break;
04524     }
04525   }
04526   for (unsigned i = HalfSize; i < HalfSize*2; ++i) {
04527     if (SVOp->getMaskElt(i) > 0) {
04528       SndHalf = SVOp->getMaskElt(i)/HalfSize;
04529       break;
04530     }
04531   }
04532 
04533   return (FstHalf | (SndHalf << 4));
04534 }
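// For example, for the v8f32 mask <4, 5, 6, 7, 12, 13, 14, 15> the lower half
// selects half 1 (the high half of V1) and the upper half selects half 3 (the
// high half of V2), giving the immediate 0x31.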
04535 
04536 // Symmetric in-lane mask. Each lane has 4 elements (for imm8)
04537 static bool isPermImmMask(ArrayRef<int> Mask, MVT VT, unsigned& Imm8) {
04538   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04539   if (EltSize < 32)
04540     return false;
04541 
04542   unsigned NumElts = VT.getVectorNumElements();
04543   Imm8 = 0;
04544   if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) {
04545     for (unsigned i = 0; i != NumElts; ++i) {
04546       if (Mask[i] < 0)
04547         continue;
04548       Imm8 |= Mask[i] << (i*2);
04549     }
04550     return true;
04551   }
04552 
04553   unsigned LaneSize = 4;
04554   SmallVector<int, 4> MaskVal(LaneSize, -1);
04555 
04556   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04557     for (unsigned i = 0; i != LaneSize; ++i) {
04558       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04559         return false;
04560       if (Mask[i+l] < 0)
04561         continue;
04562       if (MaskVal[i] < 0) {
04563         MaskVal[i] = Mask[i+l] - l;
04564         Imm8 |= MaskVal[i] << (i*2);
04565         continue;
04566       }
04567       if (Mask[i+l] != (signed)(MaskVal[i]+l))
04568         return false;
04569     }
04570   }
04571   return true;
04572 }
04573 
04574 /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand
04575 /// specifies a shuffle of elements that is suitable for input to VPERMILPD*.
04576 /// Note that VPERMIL mask matching differs depending on whether the underlying
04577 /// element type is 32 or 64 bits. For VPERMILPS the high half of the mask must
04578 /// select the same elements as the low half, but from the high half of the
04579 /// source. For VPERMILPD the two lanes can be shuffled independently, with the
04580 /// same restriction that lanes can't be crossed. Also handles PSHUFDY.
04581 static bool isVPERMILPMask(ArrayRef<int> Mask, MVT VT) {
04582   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
04583   if (VT.getSizeInBits() < 256 || EltSize < 32)
04584     return false;
04585   bool symetricMaskRequired = (EltSize == 32);
04586   unsigned NumElts = VT.getVectorNumElements();
04587 
04588   unsigned NumLanes = VT.getSizeInBits()/128;
04589   unsigned LaneSize = NumElts/NumLanes;
04590   // 2 or 4 elements in one lane
04591 
04592   SmallVector<int, 4> ExpectedMaskVal(LaneSize, -1);
04593   for (unsigned l = 0; l != NumElts; l += LaneSize) {
04594     for (unsigned i = 0; i != LaneSize; ++i) {
04595       if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize))
04596         return false;
04597       if (symetricMaskRequired) {
04598         if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) {
04599           ExpectedMaskVal[i] = Mask[i+l] - l;
04600           continue;
04601         }
04602         if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l))
04603           return false;
04604       }
04605     }
04606   }
04607   return true;
04608 }
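// For example, on v8f32 the mask <1, 0, 3, 2, 5, 4, 7, 6> matches VPERMILPS:
// both 128-bit lanes apply the same in-lane pattern <1, 0, 3, 2>, and no
// element crosses a lane boundary.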
04609 
04610 /// isCommutedMOVLMask - Returns true if the shuffle mask is the reverse of what
04611 /// x86 movss wants. X86 movss requires the lowest element to be the lowest
04612 /// element of vector 2 and the other elements to come from vector 1 in order.
04613 static bool isCommutedMOVLMask(ArrayRef<int> Mask, MVT VT,
04614                                bool V2IsSplat = false, bool V2IsUndef = false) {
04615   if (!VT.is128BitVector())
04616     return false;
04617 
04618   unsigned NumOps = VT.getVectorNumElements();
04619   if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16)
04620     return false;
04621 
04622   if (!isUndefOrEqual(Mask[0], 0))
04623     return false;
04624 
04625   for (unsigned i = 1; i != NumOps; ++i)
04626     if (!(isUndefOrEqual(Mask[i], i+NumOps) ||
04627           (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) ||
04628           (V2IsSplat && isUndefOrEqual(Mask[i], NumOps))))
04629       return false;
04630 
04631   return true;
04632 }
04633 
04634 /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04635 /// specifies a shuffle of elements that is suitable for input to MOVSHDUP.
04636 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
04637 static bool isMOVSHDUPMask(ArrayRef<int> Mask, MVT VT,
04638                            const X86Subtarget *Subtarget) {
04639   if (!Subtarget->hasSSE3())
04640     return false;
04641 
04642   unsigned NumElems = VT.getVectorNumElements();
04643 
04644   if ((VT.is128BitVector() && NumElems != 4) ||
04645       (VT.is256BitVector() && NumElems != 8) ||
04646       (VT.is512BitVector() && NumElems != 16))
04647     return false;
04648 
04649   // "i+1" is the value the indexed mask element must have
04650   for (unsigned i = 0; i != NumElems; i += 2)
04651     if (!isUndefOrEqual(Mask[i], i+1) ||
04652         !isUndefOrEqual(Mask[i+1], i+1))
04653       return false;
04654 
04655   return true;
04656 }
04657 
04658 /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04659 /// specifies a shuffle of elements that is suitable for input to MOVSLDUP.
04660 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
04661 static bool isMOVSLDUPMask(ArrayRef<int> Mask, MVT VT,
04662                            const X86Subtarget *Subtarget) {
04663   if (!Subtarget->hasSSE3())
04664     return false;
04665 
04666   unsigned NumElems = VT.getVectorNumElements();
04667 
04668   if ((VT.is128BitVector() && NumElems != 4) ||
04669       (VT.is256BitVector() && NumElems != 8) ||
04670       (VT.is512BitVector() && NumElems != 16))
04671     return false;
04672 
04673   // "i" is the value the indexed mask element must have
04674   for (unsigned i = 0; i != NumElems; i += 2)
04675     if (!isUndefOrEqual(Mask[i], i) ||
04676         !isUndefOrEqual(Mask[i+1], i))
04677       return false;
04678 
04679   return true;
04680 }
04681 
04682 /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand
04683 /// specifies a shuffle of elements that is suitable for input to 256-bit
04684 /// version of MOVDDUP.
04685 static bool isMOVDDUPYMask(ArrayRef<int> Mask, MVT VT, bool HasFp256) {
04686   if (!HasFp256 || !VT.is256BitVector())
04687     return false;
04688 
04689   unsigned NumElts = VT.getVectorNumElements();
04690   if (NumElts != 4)
04691     return false;
04692 
04693   for (unsigned i = 0; i != NumElts/2; ++i)
04694     if (!isUndefOrEqual(Mask[i], 0))
04695       return false;
04696   for (unsigned i = NumElts/2; i != NumElts; ++i)
04697     if (!isUndefOrEqual(Mask[i], NumElts/2))
04698       return false;
04699   return true;
04700 }
04701 
04702 /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand
04703 /// specifies a shuffle of elements that is suitable for input to 128-bit
04704 /// version of MOVDDUP.
04705 static bool isMOVDDUPMask(ArrayRef<int> Mask, MVT VT) {
04706   if (!VT.is128BitVector())
04707     return false;
04708 
04709   unsigned e = VT.getVectorNumElements() / 2;
04710   for (unsigned i = 0; i != e; ++i)
04711     if (!isUndefOrEqual(Mask[i], i))
04712       return false;
04713   for (unsigned i = 0; i != e; ++i)
04714     if (!isUndefOrEqual(Mask[e+i], i))
04715       return false;
04716   return true;
04717 }
04718 
04719 /// isVEXTRACTIndex - Return true if the specified
04720 /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
04721 /// suitable for instructions that extract 128 or 256-bit vectors
04722 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
04723   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04724   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04725     return false;
04726 
04727   // The index should be aligned on a vecWidth-bit boundary.
04728   uint64_t Index =
04729     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04730 
04731   MVT VT = N->getSimpleValueType(0);
04732   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04733   bool Result = (Index * ElSize) % vecWidth == 0;
04734 
04735   return Result;
04736 }
04737 
04738 /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR
04739 /// operand specifies a subvector insert that is suitable for input to
04740 /// insertion of 128 or 256-bit subvectors
04741 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
04742   assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
04743   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04744     return false;
04745   // The index should be aligned on a vecWidth-bit boundary.
04746   uint64_t Index =
04747     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04748 
04749   MVT VT = N->getSimpleValueType(0);
04750   unsigned ElSize = VT.getVectorElementType().getSizeInBits();
04751   bool Result = (Index * ElSize) % vecWidth == 0;
04752 
04753   return Result;
04754 }
04755 
04756 bool X86::isVINSERT128Index(SDNode *N) {
04757   return isVINSERTIndex(N, 128);
04758 }
04759 
04760 bool X86::isVINSERT256Index(SDNode *N) {
04761   return isVINSERTIndex(N, 256);
04762 }
04763 
04764 bool X86::isVEXTRACT128Index(SDNode *N) {
04765   return isVEXTRACTIndex(N, 128);
04766 }
04767 
04768 bool X86::isVEXTRACT256Index(SDNode *N) {
04769   return isVEXTRACTIndex(N, 256);
04770 }
04771 
04772 /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle
04773 /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
04774 /// Handles 128-bit and 256-bit.
04775 static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
04776   MVT VT = N->getSimpleValueType(0);
04777 
04778   assert((VT.getSizeInBits() >= 128) &&
04779          "Unsupported vector type for PSHUF/SHUFP");
04780 
04781   // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate
04782   // independently on 128-bit lanes.
04783   unsigned NumElts = VT.getVectorNumElements();
04784   unsigned NumLanes = VT.getSizeInBits()/128;
04785   unsigned NumLaneElts = NumElts/NumLanes;
04786 
04787   assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) &&
04788          "Only supports 2, 4 or 8 elements per lane");
04789 
04790   unsigned Shift = (NumLaneElts >= 4) ? 1 : 0;
04791   unsigned Mask = 0;
04792   for (unsigned i = 0; i != NumElts; ++i) {
04793     int Elt = N->getMaskElt(i);
04794     if (Elt < 0) continue;
04795     Elt &= NumLaneElts - 1;
04796     unsigned ShAmt = (i << Shift) % 8;
04797     Mask |= Elt << ShAmt;
04798   }
04799 
04800   return Mask;
04801 }
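// For example, for the v4f32 SHUFPS mask <1, 0, 5, 4> this returns
// 0b00010001 (0x11): bits [1:0] = 1, [3:2] = 0, [5:4] = 1 (5 & 3), [7:6] = 0.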
04802 
04803 /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
04804 /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
04805 static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
04806   MVT VT = N->getSimpleValueType(0);
04807 
04808   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04809          "Unsupported vector type for PSHUFHW");
04810 
04811   unsigned NumElts = VT.getVectorNumElements();
04812 
04813   unsigned Mask = 0;
04814   for (unsigned l = 0; l != NumElts; l += 8) {
04815     // 8 nodes per lane, but we only care about the last 4.
04816     for (unsigned i = 0; i < 4; ++i) {
04817       int Elt = N->getMaskElt(l+i+4);
04818       if (Elt < 0) continue;
04819       Elt &= 0x3; // only 2-bits.
04820       Mask |= Elt << (i * 2);
04821     }
04822   }
04823 
04824   return Mask;
04825 }
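// For example, for the v8i16 PSHUFHW mask <0, 1, 2, 3, 7, 6, 5, 4> the upper
// four elements (7, 6, 5, 4) encode as 3, 2, 1, 0, giving the immediate 0x1B.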
04826 
04827 /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
04828 /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
04829 static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
04830   MVT VT = N->getSimpleValueType(0);
04831 
04832   assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
04833          "Unsupported vector type for PSHUFHW");
04834 
04835   unsigned NumElts = VT.getVectorNumElements();
04836 
04837   unsigned Mask = 0;
04838   for (unsigned l = 0; l != NumElts; l += 8) {
04839     // 8 nodes per lane, but we only care about the first 4.
04840     for (unsigned i = 0; i < 4; ++i) {
04841       int Elt = N->getMaskElt(l+i);
04842       if (Elt < 0) continue;
04843       Elt &= 0x3; // only 2-bits
04844       Mask |= Elt << (i * 2);
04845     }
04846   }
04847 
04848   return Mask;
04849 }
04850 
04851 /// \brief Return the appropriate immediate to shuffle the specified
04852 /// VECTOR_SHUFFLE mask with the PALIGNR (if InterLane is false) or with
04853 /// VALIGN (if Interlane is true) instructions.
04854 static unsigned getShuffleAlignrImmediate(ShuffleVectorSDNode *SVOp,
04855                                            bool InterLane) {
04856   MVT VT = SVOp->getSimpleValueType(0);
04857   unsigned EltSize = InterLane ? 1 :
04858     VT.getVectorElementType().getSizeInBits() >> 3;
04859 
04860   unsigned NumElts = VT.getVectorNumElements();
04861   unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128;
04862   unsigned NumLaneElts = NumElts/NumLanes;
04863 
04864   int Val = 0;
04865   unsigned i;
04866   for (i = 0; i != NumElts; ++i) {
04867     Val = SVOp->getMaskElt(i);
04868     if (Val >= 0)
04869       break;
04870   }
04871   if (Val >= (int)NumElts)
04872     Val -= NumElts - NumLaneElts;
04873 
04874   assert(Val - i > 0 && "PALIGNR imm should be positive");
04875   return (Val - i) * EltSize;
04876 }
04877 
04878 /// \brief Return the appropriate immediate to shuffle the specified
04879 /// VECTOR_SHUFFLE mask with the PALIGNR instruction.
04880 static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
04881   return getShuffleAlignrImmediate(SVOp, false);
04882 }
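// For example, for the v8i16 PALIGNR mask <1, 2, 3, 4, 5, 6, 7, 8> the first
// defined element is 1 at position 0, so the byte shift is (1 - 0) * 2 == 2.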
04883 
04884 /// \brief Return the appropriate immediate to shuffle the specified
04885 /// VECTOR_SHUFFLE mask with the VALIGN instruction.
04886 static unsigned getShuffleVALIGNImmediate(ShuffleVectorSDNode *SVOp) {
04887   return getShuffleAlignrImmediate(SVOp, true);
04888 }
04889 
04890 
04891 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
04892   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04893   if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
04894     llvm_unreachable("Illegal extract subvector for VEXTRACT");
04895 
04896   uint64_t Index =
04897     cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
04898 
04899   MVT VecVT = N->getOperand(0).getSimpleValueType();
04900   MVT ElVT = VecVT.getVectorElementType();
04901 
04902   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04903   return Index / NumElemsPerChunk;
04904 }
04905 
04906 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
04907   assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
04908   if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
04909     llvm_unreachable("Illegal insert subvector for VINSERT");
04910 
04911   uint64_t Index =
04912     cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
04913 
04914   MVT VecVT = N->getSimpleValueType(0);
04915   MVT ElVT = VecVT.getVectorElementType();
04916 
04917   unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
04918   return Index / NumElemsPerChunk;
04919 }
04920 
04921 /// getExtractVEXTRACT128Immediate - Return the appropriate immediate
04922 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128
04923 /// and VINSERTI128 instructions.
04924 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
04925   return getExtractVEXTRACTImmediate(N, 128);
04926 }
04927 
04928 /// getExtractVEXTRACT256Immediate - Return the appropriate immediate
04929 /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4
04930 /// and VINSERTI64x4 instructions.
04931 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
04932   return getExtractVEXTRACTImmediate(N, 256);
04933 }
04934 
04935 /// getInsertVINSERT128Immediate - Return the appropriate immediate
04936 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128
04937 /// and VINSERTI128 instructions.
04938 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
04939   return getInsertVINSERTImmediate(N, 128);
04940 }
04941 
04942 /// getInsertVINSERT256Immediate - Return the appropriate immediate
04943 /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF64x4
04944 /// and VINSERTI64x4 instructions.
04945 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
04946   return getInsertVINSERTImmediate(N, 256);
04947 }
04948 
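// Worked example (editorial illustration, not part of the original source):
// the immediate is simply the index of the 128-bit or 256-bit chunk being
// addressed.  Extracting the subvector at element index 4 of a v8f32 with
// vecWidth = 128 gives NumElemsPerChunk = 128 / 32 = 4, so the VEXTRACTF128
// immediate is 4 / 4 = 1 (the upper half).  Inserting at element index 8 of
// a v16f32 with vecWidth = 256 likewise yields an immediate of 8 / 8 = 1.
//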
04949 /// isZero - Returns true if V is a constant integer zero
04950 static bool isZero(SDValue V) {
04951   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
04952   return C && C->isNullValue();
04953 }
04954 
04955 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
04956 /// constant +0.0.
04957 bool X86::isZeroNode(SDValue Elt) {
04958   if (isZero(Elt))
04959     return true;
04960   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
04961     return CFP->getValueAPF().isPosZero();
04962   return false;
04963 }
04964 
04965 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
04966 /// match movhlps. The lower half elements should come from the upper half of
04967 /// V1 (and in order), and the upper half elements should come from the upper
04968 /// half of V2 (and in order).
04969 static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
04970   if (!VT.is128BitVector())
04971     return false;
04972   if (VT.getVectorNumElements() != 4)
04973     return false;
04974   for (unsigned i = 0, e = 2; i != e; ++i)
04975     if (!isUndefOrEqual(Mask[i], i+2))
04976       return false;
04977   for (unsigned i = 2; i != 4; ++i)
04978     if (!isUndefOrEqual(Mask[i], i+4))
04979       return false;
04980   return true;
04981 }
04982 
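// Example (editorial illustration, not part of the original source): for a
// v4f32 shuffle of (V1, V2), the mask <2,3,6,7> passes both checks above
// (elements 2,3 are the upper half of V1 and elements 6,7 are the upper
// half of V2), so ShouldXformToMOVHLPS returns true.  Undef entries such as
// <u,3,6,u> are also accepted via isUndefOrEqual.
//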
04983 /// isScalarLoadToVector - Returns true if the node is a scalar load that
04984 /// is promoted to a vector. It also returns the LoadSDNode by reference if
04985 /// required.
04986 static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
04987   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
04988     return false;
04989   N = N->getOperand(0).getNode();
04990   if (!ISD::isNON_EXTLoad(N))
04991     return false;
04992   if (LD)
04993     *LD = cast<LoadSDNode>(N);
04994   return true;
04995 }
04996 
04997 // Test whether the given value is a vector value which will be legalized
04998 // into a load.
04999 static bool WillBeConstantPoolLoad(SDNode *N) {
05000   if (N->getOpcode() != ISD::BUILD_VECTOR)
05001     return false;
05002 
05003   // Check for any non-constant elements.
05004   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
05005     switch (N->getOperand(i).getNode()->getOpcode()) {
05006     case ISD::UNDEF:
05007     case ISD::ConstantFP:
05008     case ISD::Constant:
05009       break;
05010     default:
05011       return false;
05012     }
05013 
05014   // Vectors of all-zeros and all-ones are materialized with special
05015   // instructions rather than being loaded.
05016   return !ISD::isBuildVectorAllZeros(N) &&
05017          !ISD::isBuildVectorAllOnes(N);
05018 }
05019 
05020 /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to
05021 /// match movlp{s|d}. The lower half elements should come from the lower half of
05022 /// V1 (and in order), and the upper half elements should come from the upper
05023 /// half of V2 (and in order). And since V1 will become the source of the
05024 /// MOVLP, it must be either a vector load or a scalar load to vector.
05025 static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2,
05026                                ArrayRef<int> Mask, MVT VT) {
05027   if (!VT.is128BitVector())
05028     return false;
05029 
05030   if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1))
05031     return false;
05032   // If V2 is a vector load, don't do this transformation. We will try to use
05033   // a load-folding shufps instead.
05034   if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2))
05035     return false;
05036 
05037   unsigned NumElems = VT.getVectorNumElements();
05038 
05039   if (NumElems != 2 && NumElems != 4)
05040     return false;
05041   for (unsigned i = 0, e = NumElems/2; i != e; ++i)
05042     if (!isUndefOrEqual(Mask[i], i))
05043       return false;
05044   for (unsigned i = NumElems/2, e = NumElems; i != e; ++i)
05045     if (!isUndefOrEqual(Mask[i], i+NumElems))
05046       return false;
05047   return true;
05048 }
05049 
05050 /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved
05051 /// to a zero vector.
05052 /// FIXME: move to dag combiner / method on ShuffleVectorSDNode
05053 static bool isZeroShuffle(ShuffleVectorSDNode *N) {
05054   SDValue V1 = N->getOperand(0);
05055   SDValue V2 = N->getOperand(1);
05056   unsigned NumElems = N->getValueType(0).getVectorNumElements();
05057   for (unsigned i = 0; i != NumElems; ++i) {
05058     int Idx = N->getMaskElt(i);
05059     if (Idx >= (int)NumElems) {
05060       unsigned Opc = V2.getOpcode();
05061       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode()))
05062         continue;
05063       if (Opc != ISD::BUILD_VECTOR ||
05064           !X86::isZeroNode(V2.getOperand(Idx-NumElems)))
05065         return false;
05066     } else if (Idx >= 0) {
05067       unsigned Opc = V1.getOpcode();
05068       if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode()))
05069         continue;
05070       if (Opc != ISD::BUILD_VECTOR ||
05071           !X86::isZeroNode(V1.getOperand(Idx)))
05072         return false;
05073     }
05074   }
05075   return true;
05076 }
05077 
05078 /// getZeroVector - Returns a vector of specified type with all zero elements.
05079 ///
05080 static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
05081                              SelectionDAG &DAG, SDLoc dl) {
05082   assert(VT.isVector() && "Expected a vector type");
05083 
05084   // Always build SSE zero vectors as <4 x i32> bitcasted
05085   // to their dest type. This ensures they get CSE'd.
05086   SDValue Vec;
05087   if (VT.is128BitVector()) {  // SSE
05088     if (Subtarget->hasSSE2()) {  // SSE2
05089       SDValue Cst = DAG.getConstant(0, MVT::i32);
05090       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05091     } else { // SSE1
05092       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
05093       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
05094     }
05095   } else if (VT.is256BitVector()) { // AVX
05096     if (Subtarget->hasInt256()) { // AVX2
05097       SDValue Cst = DAG.getConstant(0, MVT::i32);
05098       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05099       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
05100     } else {
05101       // 256-bit logic and arithmetic instructions in AVX are all
05102       // floating-point, no support for integer ops. Emit fp zeroed vectors.
05103       SDValue Cst = DAG.getConstantFP(+0.0, MVT::f32);
05104       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05105       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
05106     }
05107   } else if (VT.is512BitVector()) { // AVX-512
05108       SDValue Cst = DAG.getConstant(0, MVT::i32);
05109       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
05110                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05111       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
05112   } else if (VT.getScalarType() == MVT::i1) {
05113     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
05114     SDValue Cst = DAG.getConstant(0, MVT::i1);
05115     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
05116     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
05117   } else
05118     llvm_unreachable("Unexpected vector type");
05119 
05120   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
05121 }
05122 
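// Example (editorial illustration, not part of the original source):
// requesting a zero v2f64 on an SSE2 target builds the canonical
// (v4i32 build_vector 0,0,0,0) and returns it wrapped in a bitcast:
//
//   SDValue Zero = getZeroVector(MVT::v2f64, Subtarget, DAG, dl);
//   // Zero == (v2f64 (bitcast (v4i32 build_vector 0, 0, 0, 0)))
//
// Funneling every zero vector through the same canonical node is what lets
// CSE share one materialized zero across different value types.
//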
05123 /// getOnesVector - Returns a vector of specified type with all bits set.
05124 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
05125 /// no AVX2 support, use two <4 x i32> inserted into an <8 x i32> appropriately.
05126 /// Then bitcast to their original type, ensuring they get CSE'd.
05127 static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
05128                              SDLoc dl) {
05129   assert(VT.isVector() && "Expected a vector type");
05130 
05131   SDValue Cst = DAG.getConstant(~0U, MVT::i32);
05132   SDValue Vec;
05133   if (VT.is256BitVector()) {
05134     if (HasInt256) { // AVX2
05135       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
05136       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
05137     } else { // AVX
05138       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05139       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
05140     }
05141   } else if (VT.is128BitVector()) {
05142     Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
05143   } else
05144     llvm_unreachable("Unexpected vector type");
05145 
05146   return DAG.getNode(ISD::BITCAST, dl, VT, Vec);
05147 }
05148 
05149 /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements
05150 /// that point to V2 point to its first element.
05151 static void NormalizeMask(SmallVectorImpl<int> &Mask, unsigned NumElems) {
05152   for (unsigned i = 0; i != NumElems; ++i) {
05153     if (Mask[i] > (int)NumElems) {
05154       Mask[i] = NumElems;
05155     }
05156   }
05157 }
05158 
05159 /// getMOVL - Returns a vector_shuffle node for a movs{s|d}, movd
05160 /// operation of specified width.
05161 static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1,
05162                        SDValue V2) {
05163   unsigned NumElems = VT.getVectorNumElements();
05164   SmallVector<int, 8> Mask;
05165   Mask.push_back(NumElems);
05166   for (unsigned i = 1; i != NumElems; ++i)
05167     Mask.push_back(i);
05168   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05169 }
05170 
05171 /// getUnpackl - Returns a vector_shuffle node for an unpackl operation.
05172 static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
05173                           SDValue V2) {
05174   unsigned NumElems = VT.getVectorNumElements();
05175   SmallVector<int, 8> Mask;
05176   for (unsigned i = 0, e = NumElems/2; i != e; ++i) {
05177     Mask.push_back(i);
05178     Mask.push_back(i + NumElems);
05179   }
05180   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05181 }
05182 
05183 /// getUnpackh - Returns a vector_shuffle node for an unpackh operation.
05184 static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1,
05185                           SDValue V2) {
05186   unsigned NumElems = VT.getVectorNumElements();
05187   SmallVector<int, 8> Mask;
05188   for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) {
05189     Mask.push_back(i + Half);
05190     Mask.push_back(i + NumElems + Half);
05191   }
05192   return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]);
05193 }
05194 
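// Example (editorial illustration, not part of the original source): for
// MVT::v8i16 the two helpers above build the interleaving masks
//
//   getUnpackl -> <0, 8, 1, 9, 2, 10, 3, 11>    // low halves of V1 and V2
//   getUnpackh -> <4, 12, 5, 13, 6, 14, 7, 15>  // high halves of V1 and V2
//
// which correspond to PUNPCKLWD / PUNPCKHWD on 128-bit integer vectors.
//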
05195 // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by
05196 // a generic shuffle instruction because the target has no such instructions.
05197 // Generate shuffles which repeat i16 and i8 several times until they can be
05198 // represented by v4f32 and then be manipulated by target supported shuffles.
05199 static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
05200   MVT VT = V.getSimpleValueType();
05201   int NumElems = VT.getVectorNumElements();
05202   SDLoc dl(V);
05203 
05204   while (NumElems > 4) {
05205     if (EltNo < NumElems/2) {
05206       V = getUnpackl(DAG, dl, VT, V, V);
05207     } else {
05208       V = getUnpackh(DAG, dl, VT, V, V);
05209       EltNo -= NumElems/2;
05210     }
05211     NumElems >>= 1;
05212   }
05213   return V;
05214 }
05215 
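// Worked example (editorial illustration, not part of the original source):
// splatting element 5 of a v16i8.  First pass: EltNo = 5 < 16/2, so unpackl
// duplicates each low byte and the wanted value now occupies 16-bit slot 5
// (NumElems becomes 8).  Second pass: EltNo = 5 >= 8/2, so unpackh is used
// and EltNo becomes 1 (NumElems becomes 4).  getLegalSplat can then splat
// 32-bit lane 1 with a single v4f32 shuffle.
//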
05216 /// getLegalSplat - Generate a legal splat with supported x86 shuffles
05217 static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
05218   MVT VT = V.getSimpleValueType();
05219   SDLoc dl(V);
05220 
05221   if (VT.is128BitVector()) {
05222     V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
05223     int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
05224     V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
05225                              &SplatMask[0]);
05226   } else if (VT.is256BitVector()) {
05227     // To use VPERMILPS to splat scalars, the second half of indices must
05228     // refer to the higher part, which is a duplication of the lower one,
05229     // because VPERMILPS can only handle in-lane permutations.
05230     int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo,
05231                          EltNo+4, EltNo+4, EltNo+4, EltNo+4 };
05232 
05233     V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V);
05234     V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32),
05235                              &SplatMask[0]);
05236   } else
05237     llvm_unreachable("Vector size not supported");
05238 
05239   return DAG.getNode(ISD::BITCAST, dl, VT, V);
05240 }
05241 
05242 /// PromoteSplat - Splat is promoted to target supported vector shuffles.
05243 static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
05244   MVT SrcVT = SV->getSimpleValueType(0);
05245   SDValue V1 = SV->getOperand(0);
05246   SDLoc dl(SV);
05247 
05248   int EltNo = SV->getSplatIndex();
05249   int NumElems = SrcVT.getVectorNumElements();
05250   bool Is256BitVec = SrcVT.is256BitVector();
05251 
05252   assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
05253          "Unknown how to promote splat for type");
05254 
05255   // Extract the 128-bit part containing the splat element and update
05256   // the splat element index when it refers to the higher register.
05257   if (Is256BitVec) {
05258     V1 = Extract128BitVector(V1, EltNo, DAG, dl);
05259     if (EltNo >= NumElems/2)
05260       EltNo -= NumElems/2;
05261   }
05262 
05263   // All i16 and i8 vector types can't be used directly by a generic shuffle
05264   // instruction because the target has no such instruction. Generate shuffles
05265   // which repeat i16 and i8 several times until they fit in i32, and then can
05266   // be manipulated by target supported shuffles.
05267   MVT EltVT = SrcVT.getVectorElementType();
05268   if (EltVT == MVT::i8 || EltVT == MVT::i16)
05269     V1 = PromoteSplati8i16(V1, DAG, EltNo);
05270 
05271   // Recreate the 256-bit vector and place the same 128-bit vector
05272   // into the low and high part. This is necessary because we want
05273   // to use VPERM* to shuffle the vectors
05274   if (Is256BitVec) {
05275     V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
05276   }
05277 
05278   return getLegalSplat(DAG, V1, EltNo);
05279 }
05280 
05281 /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified
05282 /// vector into a zero or undef vector.  This produces a shuffle where the low
05283 /// element of V2 is swizzled into the zero/undef vector, landing at element
05284 /// Idx.  This produces a shuffle mask like 4,1,2,3 (idx=0) or  0,1,2,4 (idx=3).
05285 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx,
05286                                            bool IsZero,
05287                                            const X86Subtarget *Subtarget,
05288                                            SelectionDAG &DAG) {
05289   MVT VT = V2.getSimpleValueType();
05290   SDValue V1 = IsZero
05291     ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
05292   unsigned NumElems = VT.getVectorNumElements();
05293   SmallVector<int, 16> MaskVec;
05294   for (unsigned i = 0; i != NumElems; ++i)
05295     // If this is the insertion idx, put the low elt of V2 here.
05296     MaskVec.push_back(i == Idx ? NumElems : i);
05297   return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]);
05298 }
05299 
05300 /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the
05301 /// target specific opcode. Returns true if the Mask could be calculated. Sets
05302 /// IsUnary to true if it only uses one source. Note that this will set IsUnary for
05303 /// shuffles which use a single input multiple times, and in those cases it will
05304 /// adjust the mask to only have indices within that single input.
05305 static bool getTargetShuffleMask(SDNode *N, MVT VT,
05306                                  SmallVectorImpl<int> &Mask, bool &IsUnary) {
05307   unsigned NumElems = VT.getVectorNumElements();
05308   SDValue ImmN;
05309 
05310   IsUnary = false;
05311   bool IsFakeUnary = false;
05312   switch(N->getOpcode()) {
05313   case X86ISD::BLENDI:
05314     ImmN = N->getOperand(N->getNumOperands()-1);
05315     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05316     break;
05317   case X86ISD::SHUFP:
05318     ImmN = N->getOperand(N->getNumOperands()-1);
05319     DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05320     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05321     break;
05322   case X86ISD::UNPCKH:
05323     DecodeUNPCKHMask(VT, Mask);
05324     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05325     break;
05326   case X86ISD::UNPCKL:
05327     DecodeUNPCKLMask(VT, Mask);
05328     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05329     break;
05330   case X86ISD::MOVHLPS:
05331     DecodeMOVHLPSMask(NumElems, Mask);
05332     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05333     break;
05334   case X86ISD::MOVLHPS:
05335     DecodeMOVLHPSMask(NumElems, Mask);
05336     IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
05337     break;
05338   case X86ISD::PALIGNR:
05339     ImmN = N->getOperand(N->getNumOperands()-1);
05340     DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05341     break;
05342   case X86ISD::PSHUFD:
05343   case X86ISD::VPERMILPI:
05344     ImmN = N->getOperand(N->getNumOperands()-1);
05345     DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05346     IsUnary = true;
05347     break;
05348   case X86ISD::PSHUFHW:
05349     ImmN = N->getOperand(N->getNumOperands()-1);
05350     DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05351     IsUnary = true;
05352     break;
05353   case X86ISD::PSHUFLW:
05354     ImmN = N->getOperand(N->getNumOperands()-1);
05355     DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05356     IsUnary = true;
05357     break;
05358   case X86ISD::PSHUFB: {
05359     IsUnary = true;
05360     SDValue MaskNode = N->getOperand(1);
05361     while (MaskNode->getOpcode() == ISD::BITCAST)
05362       MaskNode = MaskNode->getOperand(0);
05363 
05364     if (MaskNode->getOpcode() == ISD::BUILD_VECTOR) {
05365       // If we have a build-vector, then things are easy.
05366       EVT VT = MaskNode.getValueType();
05367       assert(VT.isVector() &&
05368              "Can't produce a non-vector with a build_vector!");
05369       if (!VT.isInteger())
05370         return false;
05371 
05372       int NumBytesPerElement = VT.getVectorElementType().getSizeInBits() / 8;
05373 
05374       SmallVector<uint64_t, 32> RawMask;
05375       for (int i = 0, e = MaskNode->getNumOperands(); i < e; ++i) {
05376         SDValue Op = MaskNode->getOperand(i);
05377         if (Op->getOpcode() == ISD::UNDEF) {
05378           RawMask.push_back((uint64_t)SM_SentinelUndef);
05379           continue;
05380         }
05381         auto *CN = dyn_cast<ConstantSDNode>(Op.getNode());
05382         if (!CN)
05383           return false;
05384         APInt MaskElement = CN->getAPIntValue();
05385 
05386         // We now have to decode the element which could be any integer size and
05387         // extract each byte of it.
05388         for (int j = 0; j < NumBytesPerElement; ++j) {
05389           // Note that this is x86 and so always little endian: the low byte is
05390           // the first byte of the mask.
05391           RawMask.push_back(MaskElement.getLoBits(8).getZExtValue());
05392           MaskElement = MaskElement.lshr(8);
05393         }
05394       }
05395       DecodePSHUFBMask(RawMask, Mask);
05396       break;
05397     }
05398 
05399     auto *MaskLoad = dyn_cast<LoadSDNode>(MaskNode);
05400     if (!MaskLoad)
05401       return false;
05402 
05403     SDValue Ptr = MaskLoad->getBasePtr();
05404     if (Ptr->getOpcode() == X86ISD::Wrapper)
05405       Ptr = Ptr->getOperand(0);
05406 
05407     auto *MaskCP = dyn_cast<ConstantPoolSDNode>(Ptr);
05408     if (!MaskCP || MaskCP->isMachineConstantPoolEntry())
05409       return false;
05410 
05411     if (auto *C = dyn_cast<Constant>(MaskCP->getConstVal())) {
05412       // FIXME: Support AVX-512 here.
05413       Type *Ty = C->getType();
05414       if (!Ty->isVectorTy() || (Ty->getVectorNumElements() != 16 &&
05415                                 Ty->getVectorNumElements() != 32))
05416         return false;
05417 
05418       DecodePSHUFBMask(C, Mask);
05419       break;
05420     }
05421 
05422     return false;
05423   }
05424   case X86ISD::VPERMI:
05425     ImmN = N->getOperand(N->getNumOperands()-1);
05426     DecodeVPERMMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05427     IsUnary = true;
05428     break;
05429   case X86ISD::MOVSS:
05430   case X86ISD::MOVSD: {
05431     // The index 0 always comes from the first element of the second source,
05432     // this is why MOVSS and MOVSD are used in the first place. The other
05433     // elements come from the other positions of the first source vector
05434     Mask.push_back(NumElems);
05435     for (unsigned i = 1; i != NumElems; ++i) {
05436       Mask.push_back(i);
05437     }
05438     break;
05439   }
05440   case X86ISD::VPERM2X128:
05441     ImmN = N->getOperand(N->getNumOperands()-1);
05442     DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
05443     if (Mask.empty()) return false;
05444     break;
05445   case X86ISD::MOVSLDUP:
05446     DecodeMOVSLDUPMask(VT, Mask);
05447     break;
05448   case X86ISD::MOVSHDUP:
05449     DecodeMOVSHDUPMask(VT, Mask);
05450     break;
05451   case X86ISD::MOVDDUP:
05452   case X86ISD::MOVLHPD:
05453   case X86ISD::MOVLPD:
05454   case X86ISD::MOVLPS:
05455     // Not yet implemented
05456     return false;
05457   default: llvm_unreachable("unknown target shuffle node");
05458   }
05459 
05460   // If we have a fake unary shuffle, the shuffle mask is spread across two
05461   // inputs that are actually the same node. Re-map the mask to always point
05462   // into the first input.
05463   if (IsFakeUnary)
05464     for (int &M : Mask)
05465       if (M >= (int)Mask.size())
05466         M -= Mask.size();
05467 
05468   return true;
05469 }
05470 
05471 /// getShuffleScalarElt - Returns the scalar element that will make up the ith
05472 /// element of the result of the vector shuffle.
05473 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
05474                                    unsigned Depth) {
05475   if (Depth == 6)
05476     return SDValue();  // Limit search depth.
05477 
05478   SDValue V = SDValue(N, 0);
05479   EVT VT = V.getValueType();
05480   unsigned Opcode = V.getOpcode();
05481 
05482   // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
05483   if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
05484     int Elt = SV->getMaskElt(Index);
05485 
05486     if (Elt < 0)
05487       return DAG.getUNDEF(VT.getVectorElementType());
05488 
05489     unsigned NumElems = VT.getVectorNumElements();
05490     SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
05491                                          : SV->getOperand(1);
05492     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
05493   }
05494 
05495   // Recurse into target specific vector shuffles to find scalars.
05496   if (isTargetShuffle(Opcode)) {
05497     MVT ShufVT = V.getSimpleValueType();
05498     unsigned NumElems = ShufVT.getVectorNumElements();
05499     SmallVector<int, 16> ShuffleMask;
05500     bool IsUnary;
05501 
05502     if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary))
05503       return SDValue();
05504 
05505     int Elt = ShuffleMask[Index];
05506     if (Elt < 0)
05507       return DAG.getUNDEF(ShufVT.getVectorElementType());
05508 
05509     SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0)
05510                                          : N->getOperand(1);
05511     return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
05512                                Depth+1);
05513   }
05514 
05515   // Actual nodes that may contain scalar elements
05516   if (Opcode == ISD::BITCAST) {
05517     V = V.getOperand(0);
05518     EVT SrcVT = V.getValueType();
05519     unsigned NumElems = VT.getVectorNumElements();
05520 
05521     if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
05522       return SDValue();
05523   }
05524 
05525   if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
05526     return (Index == 0) ? V.getOperand(0)
05527                         : DAG.getUNDEF(VT.getVectorElementType());
05528 
05529   if (V.getOpcode() == ISD::BUILD_VECTOR)
05530     return V.getOperand(Index);
05531 
05532   return SDValue();
05533 }
05534 
05535 /// getNumOfConsecutiveZeros - Return the number of elements of a vector
05536 /// shuffle operation which come from a consecutively from a zero. The
05537 /// search can start in two different directions, from left or right.
05538 /// We count undefs as zeros until PreferredNum is reached.
05539 static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp,
05540                                          unsigned NumElems, bool ZerosFromLeft,
05541                                          SelectionDAG &DAG,
05542                                          unsigned PreferredNum = -1U) {
05543   unsigned NumZeros = 0;
05544   for (unsigned i = 0; i != NumElems; ++i) {
05545     unsigned Index = ZerosFromLeft ? i : NumElems - i - 1;
05546     SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0);
05547     if (!Elt.getNode())
05548       break;
05549 
05550     if (X86::isZeroNode(Elt))
05551       ++NumZeros;
05552     else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum.
05553       NumZeros = std::min(NumZeros + 1, PreferredNum);
05554     else
05555       break;
05556   }
05557 
05558   return NumZeros;
05559 }
05560 
05561 /// isShuffleMaskConsecutive - Check if the shuffle mask indices [MaskI, MaskE)
05562 /// correspond consecutively to elements from one of the vector operands,
05563 /// starting from its index OpIdx. Also set OpNum to the matching source operand.
05564 static
05565 bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp,
05566                               unsigned MaskI, unsigned MaskE, unsigned OpIdx,
05567                               unsigned NumElems, unsigned &OpNum) {
05568   bool SeenV1 = false;
05569   bool SeenV2 = false;
05570 
05571   for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) {
05572     int Idx = SVOp->getMaskElt(i);
05573     // Ignore undef indices
05574     if (Idx < 0)
05575       continue;
05576 
05577     if (Idx < (int)NumElems)
05578       SeenV1 = true;
05579     else
05580       SeenV2 = true;
05581 
05582     // Only accept consecutive elements from the same vector
05583     if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2))
05584       return false;
05585   }
05586 
05587   OpNum = SeenV1 ? 0 : 1;
05588   return true;
05589 }
05590 
05591 /// isVectorShiftRight - Returns true if the shuffle can be implemented as a
05592 /// logical right shift of a vector.
05593 static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05594                                bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05595   unsigned NumElems =
05596     SVOp->getSimpleValueType(0).getVectorNumElements();
05597   unsigned NumZeros = getNumOfConsecutiveZeros(
05598       SVOp, NumElems, false /* check zeros from right */, DAG,
05599       SVOp->getMaskElt(0));
05600   unsigned OpSrc;
05601 
05602   if (!NumZeros)
05603     return false;
05604 
05605   // Considering the elements in the mask that are not consecutive zeros,
05606   // check if they consecutively come from only one of the source vectors.
05607   //
05608   //               V1 = {X, A, B, C}     0
05609   //                         \  \  \    /
05610   //   vector_shuffle V1, V2 <1, 2, 3, X>
05611   //
05612   if (!isShuffleMaskConsecutive(SVOp,
05613             0,                   // Mask Start Index
05614             NumElems-NumZeros,   // Mask End Index(exclusive)
05615             NumZeros,            // Where to start looking in the src vector
05616             NumElems,            // Number of elements in vector
05617             OpSrc))              // Which source operand ?
05618     return false;
05619 
05620   isLeft = false;
05621   ShAmt = NumZeros;
05622   ShVal = SVOp->getOperand(OpSrc);
05623   return true;
05624 }
05625 
05626 /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a
05627 /// logical left shift of a vector.
05628 static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05629                               bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05630   unsigned NumElems =
05631     SVOp->getSimpleValueType(0).getVectorNumElements();
05632   unsigned NumZeros = getNumOfConsecutiveZeros(
05633       SVOp, NumElems, true /* check zeros from left */, DAG,
05634       NumElems - SVOp->getMaskElt(NumElems - 1) - 1);
05635   unsigned OpSrc;
05636 
05637   if (!NumZeros)
05638     return false;
05639 
05640   // Considering the elements in the mask that are not consecutive zeros,
05641   // check if they consecutively come from only one of the source vectors.
05642   //
05643   //                           0    { A, B, X, X } = V2
05644   //                          / \    /  /
05645   //   vector_shuffle V1, V2 <X, X, 4, 5>
05646   //
05647   if (!isShuffleMaskConsecutive(SVOp,
05648             NumZeros,     // Mask Start Index
05649             NumElems,     // Mask End Index(exclusive)
05650             0,            // Where to start looking in the src vector
05651             NumElems,     // Number of elements in vector
05652             OpSrc))       // Which source operand ?
05653     return false;
05654 
05655   isLeft = true;
05656   ShAmt = NumZeros;
05657   ShVal = SVOp->getOperand(OpSrc);
05658   return true;
05659 }
05660 
05661 /// isVectorShift - Returns true if the shuffle can be implemented as a
05662 /// logical left or right shift of a vector.
05663 static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG,
05664                           bool &isLeft, SDValue &ShVal, unsigned &ShAmt) {
05665   // Although the logic below supports any bitwidth size, there are no
05666   // shift instructions which handle more than 128-bit vectors.
05667   if (!SVOp->getSimpleValueType(0).is128BitVector())
05668     return false;
05669 
05670   if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) ||
05671       isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt))
05672     return true;
05673 
05674   return false;
05675 }
05676 
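// Example (editorial illustration, not part of the original source): for a
// v4i32 shuffle of (V1, zero) with mask <1, 2, 3, 4>, element 3 of the
// result is a zero taken from the second operand and elements 0..2 come
// consecutively from V1 starting at index 1, so isVectorShiftRight reports
// isLeft = false, ShVal = V1, ShAmt = 1.  The caller can then lower this as
// a whole-register byte shift (VSRLDQ/PSRLDQ).
//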
05677 /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8.
05678 ///
05679 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
05680                                        unsigned NumNonZero, unsigned NumZero,
05681                                        SelectionDAG &DAG,
05682                                        const X86Subtarget* Subtarget,
05683                                        const TargetLowering &TLI) {
05684   if (NumNonZero > 8)
05685     return SDValue();
05686 
05687   SDLoc dl(Op);
05688   SDValue V;
05689   bool First = true;
05690   for (unsigned i = 0; i < 16; ++i) {
05691     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
05692     if (ThisIsNonZero && First) {
05693       if (NumZero)
05694         V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05695       else
05696         V = DAG.getUNDEF(MVT::v8i16);
05697       First = false;
05698     }
05699 
05700     if ((i & 1) != 0) {
05701       SDValue ThisElt, LastElt;
05702       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
05703       if (LastIsNonZero) {
05704         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
05705                               MVT::i16, Op.getOperand(i-1));
05706       }
05707       if (ThisIsNonZero) {
05708         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
05709         ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
05710                               ThisElt, DAG.getConstant(8, MVT::i8));
05711         if (LastIsNonZero)
05712           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
05713       } else
05714         ThisElt = LastElt;
05715 
05716       if (ThisElt.getNode())
05717         V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
05718                         DAG.getIntPtrConstant(i/2));
05719     }
05720   }
05721 
05722   return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V);
05723 }
05724 
05725 /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16.
05726 ///
05727 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
05728                                      unsigned NumNonZero, unsigned NumZero,
05729                                      SelectionDAG &DAG,
05730                                      const X86Subtarget* Subtarget,
05731                                      const TargetLowering &TLI) {
05732   if (NumNonZero > 4)
05733     return SDValue();
05734 
05735   SDLoc dl(Op);
05736   SDValue V;
05737   bool First = true;
05738   for (unsigned i = 0; i < 8; ++i) {
05739     bool isNonZero = (NonZeros & (1 << i)) != 0;
05740     if (isNonZero) {
05741       if (First) {
05742         if (NumZero)
05743           V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
05744         else
05745           V = DAG.getUNDEF(MVT::v8i16);
05746         First = false;
05747       }
05748       V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
05749                       MVT::v8i16, V, Op.getOperand(i),
05750                       DAG.getIntPtrConstant(i));
05751     }
05752   }
05753 
05754   return V;
05755 }
05756 
05757 /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
05758 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
05759                                      const X86Subtarget *Subtarget,
05760                                      const TargetLowering &TLI) {
05761   // Find all zeroable elements.
05762   bool Zeroable[4];
05763   for (int i=0; i < 4; ++i) {
05764     SDValue Elt = Op->getOperand(i);
05765     Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
05766   }
05767   assert(std::count_if(&Zeroable[0], &Zeroable[4],
05768                        [](bool M) { return !M; }) > 1 &&
05769          "We expect at least two non-zero elements!");
05770 
05771   // We only know how to deal with build_vector nodes where elements are either
05772   // zeroable or extract_vector_elt with constant index.
05773   SDValue FirstNonZero;
05774   for (int i=0; i < 4; ++i) {
05775     if (Zeroable[i])
05776       continue;
05777     SDValue Elt = Op->getOperand(i);
05778     if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
05779         !isa<ConstantSDNode>(Elt.getOperand(1)))
05780       return SDValue();
05781     // Make sure that this node is extracting from a 128-bit vector.
05782     MVT VT = Elt.getOperand(0).getSimpleValueType();
05783     if (!VT.is128BitVector())
05784       return SDValue();
05785     if (!FirstNonZero.getNode())
05786       FirstNonZero = Elt;
05787   }
05788 
05789   assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
05790   SDValue V1 = FirstNonZero.getOperand(0);
05791   MVT VT = V1.getSimpleValueType();
05792 
05793   // See if this build_vector can be lowered as a blend with zero.
05794   SDValue Elt;
05795   unsigned EltMaskIdx, EltIdx;
05796   int Mask[4];
05797   for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
05798     if (Zeroable[EltIdx]) {
05799       // The zero vector will be on the right hand side.
05800       Mask[EltIdx] = EltIdx+4;
05801       continue;
05802     }
05803 
05804     Elt = Op->getOperand(EltIdx);
05805     // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
05806     EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
05807     if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
05808       break;
05809     Mask[EltIdx] = EltIdx;
05810   }
05811 
05812   if (EltIdx == 4) {
05813     // Let the shuffle legalizer deal with blend operations.
05814     SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
05815     if (V1.getSimpleValueType() != VT)
05816       V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
05817     return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
05818   }
05819 
05820   // See if we can lower this build_vector to a INSERTPS.
05821   if (!Subtarget->hasSSE41())
05822     return SDValue();
05823 
05824   SDValue V2 = Elt.getOperand(0);
05825   if (Elt == FirstNonZero)
05826     V1 = SDValue();
05827 
05828   bool CanFold = true;
05829   for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
05830     if (Zeroable[i])
05831       continue;
05832     
05833     SDValue Current = Op->getOperand(i);
05834     SDValue SrcVector = Current->getOperand(0);
05835     if (!V1.getNode())
05836       V1 = SrcVector;
05837     CanFold = SrcVector == V1 &&
05838       cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
05839   }
05840 
05841   if (!CanFold)
05842     return SDValue();
05843 
05844   assert(V1.getNode() && "Expected at least two non-zero elements!");
05845   if (V1.getSimpleValueType() != MVT::v4f32)
05846     V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
05847   if (V2.getSimpleValueType() != MVT::v4f32)
05848     V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
05849 
05850   // Ok, we can emit an INSERTPS instruction.
05851   unsigned ZMask = 0;
05852   for (int i = 0; i < 4; ++i)
05853     if (Zeroable[i])
05854       ZMask |= 1 << i;
05855 
05856   unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
05857   assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
05858   SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
05859                                DAG.getIntPtrConstant(InsertPSMask));
05860   return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
05861 }
05862 
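// Worked example (editorial illustration, not part of the original source):
// the INSERTPS immediate built above packs the source lane into bits [7:6],
// the destination lane into bits [5:4] and the zero mask into bits [3:0].
// Inserting source element 2 into destination element 1 while zeroing
// elements 0 and 3 gives
//
//   InsertPSMask = (2 << 6) | (1 << 4) | 0b1001 = 0x99
//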
05863 /// getVShift - Return a vector logical shift node.
05864 ///
05865 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
05866                          unsigned NumBits, SelectionDAG &DAG,
05867                          const TargetLowering &TLI, SDLoc dl) {
05868   assert(VT.is128BitVector() && "Unknown type for VShift");
05869   EVT ShVT = MVT::v2i64;
05870   unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
05871   SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp);
05872   return DAG.getNode(ISD::BITCAST, dl, VT,
05873                      DAG.getNode(Opc, dl, ShVT, SrcOp,
05874                              DAG.getConstant(NumBits,
05875                                   TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
05876 }
05877 
05878 static SDValue
05879 LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) {
05880 
05881   // Check if the scalar load can be widened into a vector load. And if
05882   // the address is "base + cst" see if the cst can be "absorbed" into
05883   // the shuffle mask.
05884   if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
05885     SDValue Ptr = LD->getBasePtr();
05886     if (!ISD::isNormalLoad(LD) || LD->isVolatile())
05887       return SDValue();
05888     EVT PVT = LD->getValueType(0);
05889     if (PVT != MVT::i32 && PVT != MVT::f32)
05890       return SDValue();
05891 
05892     int FI = -1;
05893     int64_t Offset = 0;
05894     if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
05895       FI = FINode->getIndex();
05896       Offset = 0;
05897     } else if (DAG.isBaseWithConstantOffset(Ptr) &&
05898                isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
05899       FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
05900       Offset = Ptr.getConstantOperandVal(1);
05901       Ptr = Ptr.getOperand(0);
05902     } else {
05903       return SDValue();
05904     }
05905 
05906     // FIXME: 256-bit vector instructions don't require a strict alignment,
05907     // improve this code to support it better.
05908     unsigned RequiredAlign = VT.getSizeInBits()/8;
05909     SDValue Chain = LD->getChain();
05910     // Make sure the stack object alignment is at least 16 or 32.
05911     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
05912     if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
05913       if (MFI->isFixedObjectIndex(FI)) {
05914         // Can't change the alignment. FIXME: It's possible to compute
05915         // the exact stack offset and reference FI + adjust offset instead.
05916         // If someone *really* cares about this. That's the way to implement it.
05917         return SDValue();
05918       } else {
05919         MFI->setObjectAlignment(FI, RequiredAlign);
05920       }
05921     }
05922 
05923     // (Offset % 16 or 32) must be a multiple of 4. The address is then
05924     // Ptr + (Offset & ~15).
05925     if (Offset < 0)
05926       return SDValue();
05927     if ((Offset % RequiredAlign) & 3)
05928       return SDValue();
05929     int64_t StartOffset = Offset & ~(RequiredAlign-1);
05930     if (StartOffset)
05931       Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(),
05932                         Ptr,DAG.getConstant(StartOffset, Ptr.getValueType()));
05933 
05934     int EltNo = (Offset - StartOffset) >> 2;
05935     unsigned NumElems = VT.getVectorNumElements();
05936 
05937     EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
05938     SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
05939                              LD->getPointerInfo().getWithOffset(StartOffset),
05940                              false, false, false, 0);
05941 
05942     SmallVector<int, 8> Mask;
05943     for (unsigned i = 0; i != NumElems; ++i)
05944       Mask.push_back(EltNo);
05945 
05946     return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]);
05947   }
05948 
05949   return SDValue();
05950 }
05951 
05952 /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a
05953 /// vector of type 'VT', see if the elements can be replaced by a single large
05954 /// load which has the same value as a build_vector whose operands are 'elts'.
05955 ///
05956 /// Example: <load i32 *a, load i32 *a+4, undef, undef> -> zextload a
05957 ///
05958 /// FIXME: we'd also like to handle the case where the last elements are zero
05959 /// rather than undef via VZEXT_LOAD, but we do not detect that case today.
05960 /// There's even a handy isZeroNode for that purpose.
05961 static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
05962                                         SDLoc &DL, SelectionDAG &DAG,
05963                                         bool isAfterLegalize) {
05964   EVT EltVT = VT.getVectorElementType();
05965   unsigned NumElems = Elts.size();
05966 
05967   LoadSDNode *LDBase = nullptr;
05968   unsigned LastLoadedElt = -1U;
05969 
05970   // For each element in the initializer, see if we've found a load or an undef.
05971   // If we don't find an initial load element, or later load elements are
05972   // non-consecutive, bail out.
05973   for (unsigned i = 0; i < NumElems; ++i) {
05974     SDValue Elt = Elts[i];
05975 
05976     if (!Elt.getNode() ||
05977         (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode())))
05978       return SDValue();
05979     if (!LDBase) {
05980       if (Elt.getNode()->getOpcode() == ISD::UNDEF)
05981         return SDValue();
05982       LDBase = cast<LoadSDNode>(Elt.getNode());
05983       LastLoadedElt = i;
05984       continue;
05985     }
05986     if (Elt.getOpcode() == ISD::UNDEF)
05987       continue;
05988 
05989     LoadSDNode *LD = cast<LoadSDNode>(Elt);
05990     if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i))
05991       return SDValue();
05992     LastLoadedElt = i;
05993   }
05994 
05995   // If we have found an entire vector of loads and undefs, then return a large
05996   // load of the entire vector width starting at the base pointer.  If we found
05997   // consecutive loads for the low half, generate a vzext_load node.
05998   if (LastLoadedElt == NumElems - 1) {
05999 
06000     if (isAfterLegalize &&
06001         !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT))
06002       return SDValue();
06003 
06004     // Emit a single full-width load, reusing the base load's pointer
06005     // info, volatility, non-temporal and invariant flags and alignment.
06006     SDValue NewLd =
06007       DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
06008                   LDBase->getPointerInfo(),
06009                   LDBase->isVolatile(), LDBase->isNonTemporal(),
06010                   LDBase->isInvariant(), LDBase->getAlignment());
06015 
06016     if (LDBase->hasAnyUseOfValue(1)) {
06017       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
06018                                      SDValue(LDBase, 1),
06019                                      SDValue(NewLd.getNode(), 1));
06020       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
06021       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
06022                              SDValue(NewLd.getNode(), 1));
06023     }
06024 
06025     return NewLd;
06026   }
06027   if (NumElems == 4 && LastLoadedElt == 1 &&
06028       DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) {
06029     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
06030     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
06031     SDValue ResNode =
06032         DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
06033                                 LDBase->getPointerInfo(),
06034                                 LDBase->getAlignment(),
06035                                 false/*isVolatile*/, true/*ReadMem*/,
06036                                 false/*WriteMem*/);
06037 
06038     // Make sure the newly-created LOAD is in the same position as LDBase in
06039     // terms of dependency. We create a TokenFactor for LDBase and ResNode, and
06040     // update uses of LDBase's output chain to use the TokenFactor.
06041     if (LDBase->hasAnyUseOfValue(1)) {
06042       SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
06043                              SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1));
06044       DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
06045       DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
06046                              SDValue(ResNode.getNode(), 1));
06047     }
06048 
06049     return DAG.getNode(ISD::BITCAST, DL, VT, ResNode);
06050   }
06051   return SDValue();
06052 }
06053 
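// Example (editorial illustration, not part of the original source): for a
// v4i32 build_vector <load a[0], load a[1], undef, undef> only the low half
// is loaded consecutively (LastLoadedElt == 1), so the second path above
// emits an X86ISD::VZEXT_LOAD of i64 from &a[0] and bitcasts the v2i64
// result back to v4i32.  The upper 64 bits come back as zero rather than
// undef, which is still a valid refinement of the original undef elements.
//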
06054 /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction
06055 /// to generate a splat value for the following cases:
06056 /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant.
06057 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
06058 /// a scalar load, or a constant.
06059 /// The VBROADCAST node is returned when a pattern is found,
06060 /// or SDValue() otherwise.
06061 static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
06062                                     SelectionDAG &DAG) {
06063   // VBROADCAST requires AVX.
06064   // TODO: Splats could be generated for non-AVX CPUs using SSE
06065   // instructions, but there's less potential gain for only 128-bit vectors.
06066   if (!Subtarget->hasAVX())
06067     return SDValue();
06068 
06069   MVT VT = Op.getSimpleValueType();
06070   SDLoc dl(Op);
06071 
06072   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
06073          "Unsupported vector type for broadcast.");
06074 
06075   SDValue Ld;
06076   bool ConstSplatVal;
06077 
06078   switch (Op.getOpcode()) {
06079     default:
06080       // Unknown pattern found.
06081       return SDValue();
06082 
06083     case ISD::BUILD_VECTOR: {
06084       auto *BVOp = cast<BuildVectorSDNode>(Op.getNode());
06085       BitVector UndefElements;
06086       SDValue Splat = BVOp->getSplatValue(&UndefElements);
06087 
06088       // We need a splat of a single value to use broadcast, and it doesn't
06089       // make any sense if the value is only in one element of the vector.
06090       if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1)
06091         return SDValue();
06092 
06093       Ld = Splat;
06094       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
06095                        Ld.getOpcode() == ISD::ConstantFP);
06096 
06097       // Make sure that all of the users of a non-constant load are from the
06098       // BUILD_VECTOR node.
06099       if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
06100         return SDValue();
06101       break;
06102     }
06103 
06104     case ISD::VECTOR_SHUFFLE: {
06105       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
06106 
06107       // Shuffles must have a splat mask where the first element is
06108       // broadcasted.
06109       if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0)
06110         return SDValue();
06111 
06112       SDValue Sc = Op.getOperand(0);
06113       if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR &&
06114           Sc.getOpcode() != ISD::BUILD_VECTOR) {
06115 
06116         if (!Subtarget->hasInt256())
06117           return SDValue();
06118 
06119         // Use the register form of the broadcast instruction available on AVX2.
06120         if (VT.getSizeInBits() >= 256)
06121           Sc = Extract128BitVector(Sc, 0, DAG, dl);
06122         return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc);
06123       }
06124 
06125       Ld = Sc.getOperand(0);
06126       ConstSplatVal = (Ld.getOpcode() == ISD::Constant ||
06127                        Ld.getOpcode() == ISD::ConstantFP);
06128 
06129       // The scalar_to_vector node and the suspected
06130       // load node must have exactly one user.
06131       // Constants may have multiple users.
06132 
06133       // AVX-512 has register version of the broadcast
06134       bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() &&
06135         Ld.getValueType().getSizeInBits() >= 32;
06136       if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) &&
06137           !hasRegVer))
06138         return SDValue();
06139       break;
06140     }
06141   }
06142 
06143   unsigned ScalarSize = Ld.getValueType().getSizeInBits();
06144   bool IsGE256 = (VT.getSizeInBits() >= 256);
06145 
06146   // When optimizing for size, generate up to 5 extra bytes for a broadcast
06147   // instruction to save 8 or more bytes of constant pool data.
06148   // TODO: If multiple splats are generated to load the same constant,
06149   // it may be detrimental to overall size. There needs to be a way to detect
06150   // that condition to know if this is truly a size win.
06151   const Function *F = DAG.getMachineFunction().getFunction();
06152   bool OptForSize = F->getAttributes().
06153     hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
06154 
06155   // Handle broadcasting a single constant scalar from the constant pool
06156   // into a vector.
06157   // On Sandybridge (no AVX2), it is still better to load a constant vector
06158   // from the constant pool and not to broadcast it from a scalar.
06159   // But override that restriction when optimizing for size.
06160   // TODO: Check if splatting is recommended for other AVX-capable CPUs.
06161   if (ConstSplatVal && (Subtarget->hasAVX2() || OptForSize)) {
06162     EVT CVT = Ld.getValueType();
06163     assert(!CVT.isVector() && "Must not broadcast a vector type");
06164 
06165     // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
06166     // For size optimization, also splat v2f64 and v2i64, and for size opt
06167     // with AVX2, also splat i8 and i16.
06168     // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
06169     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
06170         (OptForSize && (ScalarSize == 64 || Subtarget->hasAVX2()))) {
06171       const Constant *C = nullptr;
06172       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
06173         C = CI->getConstantIntValue();
06174       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
06175         C = CF->getConstantFPValue();
06176 
06177       assert(C && "Invalid constant type");
06178 
06179       const TargetLowering &TLI = DAG.getTargetLoweringInfo();
06180       SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy());
06181       unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
06182       Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP,
06183                        MachinePointerInfo::getConstantPool(),
06184                        false, false, false, Alignment);
06185 
06186       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06187     }
06188   }
06189 
06190   bool IsLoad = ISD::isNormalLoad(Ld.getNode());
06191 
06192   // Handle AVX2 in-register broadcasts.
06193   if (!IsLoad && Subtarget->hasInt256() &&
06194       (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
06195     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06196 
06197   // The scalar source must be a normal load.
06198   if (!IsLoad)
06199     return SDValue();
06200 
06201   if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))
06202     return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06203 
06204   // The integer check is needed for the 64-bit into 128-bit case, so that it
06205   // doesn't match f64, since there is no vbroadcastsd xmm instruction.
06206   if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) {
06207     if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
06208       return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
06209   }
06210 
06211   // Unsupported broadcast.
06212   return SDValue();
06213 }
06214 
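// Example (editorial illustration, not part of the original source): a
// v8f32 BUILD_VECTOR whose eight operands are the same f32 load, with no
// other users of that load, reaches the normal-load path above with
// ScalarSize == 32 and is lowered to (v8f32 (X86ISD::VBROADCAST load)),
// which is selected as VBROADCASTSS on AVX targets.
//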
06215 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
06216 /// underlying vector and index.
06217 ///
06218 /// Modifies \p ExtractedFromVec to the real vector and returns the real
06219 /// index.
06220 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
06221                                          SDValue ExtIdx) {
06222   int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
06223   if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
06224     return Idx;
06225 
06226   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
06227   // lowered this:
06228   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
06229   // to:
06230   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
06231   //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
06232   //                           undef)
06233   //                       Constant<0>)
06234   // In this case the vector is the extract_subvector expression and the index
06235   // is 2, as specified by the shuffle.
06236   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
06237   SDValue ShuffleVec = SVOp->getOperand(0);
06238   MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
06239   assert(ShuffleVecVT.getVectorElementType() ==
06240          ExtractedFromVec.getSimpleValueType().getVectorElementType());
06241 
06242   int ShuffleIdx = SVOp->getMaskElt(Idx);
06243   if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
06244     ExtractedFromVec = ShuffleVec;
06245     return ShuffleIdx;
06246   }
06247   return Idx;
06248 }
06249 
06250 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
06251   MVT VT = Op.getSimpleValueType();
06252 
06253   // Skip if insert_vec_elt is not supported.
06254   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
06255   if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
06256     return SDValue();
06257 
06258   SDLoc DL(Op);
06259   unsigned NumElems = Op.getNumOperands();
06260 
06261   SDValue VecIn1;
06262   SDValue VecIn2;
06263   SmallVector<unsigned, 4> InsertIndices;
06264   SmallVector<int, 8> Mask(NumElems, -1);
06265 
06266   for (unsigned i = 0; i != NumElems; ++i) {
06267     unsigned Opc = Op.getOperand(i).getOpcode();
06268 
06269     if (Opc == ISD::UNDEF)
06270       continue;
06271 
06272     if (Opc != ISD::EXTRACT_VECTOR_ELT) {
06273       // Quit if more than 1 element needs inserting.
06274       if (InsertIndices.size() > 1)
06275         return SDValue();
06276 
06277       InsertIndices.push_back(i);
06278       continue;
06279     }
06280 
06281     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
06282     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
06283     // Quit if non-constant index.
06284     if (!isa<ConstantSDNode>(ExtIdx))
06285       return SDValue();
06286     int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
06287 
06288     // Quit if extracted from vector of different type.
06289     if (ExtractedFromVec.getValueType() != VT)
06290       return SDValue();
06291 
06292     if (!VecIn1.getNode())
06293       VecIn1 = ExtractedFromVec;
06294     else if (VecIn1 != ExtractedFromVec) {
06295       if (!VecIn2.getNode())
06296         VecIn2 = ExtractedFromVec;
06297       else if (VecIn2 != ExtractedFromVec)
06298         // Quit if more than 2 vectors would need shuffling.
06299         return SDValue();
06300     }
06301 
06302     if (ExtractedFromVec == VecIn1)
06303       Mask[i] = Idx;
06304     else if (ExtractedFromVec == VecIn2)
06305       Mask[i] = Idx + NumElems;
06306   }
06307 
06308   if (!VecIn1.getNode())
06309     return SDValue();
06310 
06311   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
06312   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]);
06313   for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
06314     unsigned Idx = InsertIndices[i];
06315     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
06316                      DAG.getIntPtrConstant(Idx));
06317   }
06318 
06319   return NV;
06320 }
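
// Illustrative example of what buildFromShuffleMostly handles (a sketch, with
// hypothetical v4i32 inputs %A and %B and a hypothetical scalar %s):
//   (build_vector (extract_vector_elt %A, 0),
//                 (extract_vector_elt %B, 2),
//                 %s,
//                 (extract_vector_elt %A, 3))
// becomes a single vector_shuffle of %A and %B with mask <0, 6, -1, 3>,
// followed by one insert_vector_elt of %s at index 2.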
06321 
06322 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
06323 SDValue
06324 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
06325 
06326   MVT VT = Op.getSimpleValueType();
06327   assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) &&
06328          "Unexpected type in LowerBUILD_VECTORvXi1!");
06329 
06330   SDLoc dl(Op);
06331   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
06332     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
06333     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
06334     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
06335   }
06336 
06337   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
06338     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
06339     SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
06340     return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
06341   }
06342 
06343   bool AllConstants = true;
06344   uint64_t Immediate = 0;
06345   int NonConstIdx = -1;
06346   bool IsSplat = true;
06347   unsigned NumNonConsts = 0;
06348   unsigned NumConsts = 0;
06349   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
06350     SDValue In = Op.getOperand(idx);
06351     if (In.getOpcode() == ISD::UNDEF)
06352       continue;
06353     if (!isa<ConstantSDNode>(In)) {
06354       AllConstants = false;
06355       NonConstIdx = idx;
06356       NumNonConsts++;
06357     }
06358     else {
06359       NumConsts++;
06360       if (cast<ConstantSDNode>(In)->getZExtValue())
06361         Immediate |= (1ULL << idx);
06362     }
06363     if (In != Op.getOperand(0))
06364       IsSplat = false;
06365   }
06366 
06367   if (AllConstants) {
06368     SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1,
06369       DAG.getConstant(Immediate, MVT::i16));
06370     return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask,
06371                        DAG.getIntPtrConstant(0));
06372   }
06373 
06374   if (NumNonConsts == 1 && NonConstIdx != 0) {
06375     SDValue DstVec;
06376     if (NumConsts) {
06377       SDValue VecAsImm = DAG.getConstant(Immediate,
06378                                          MVT::getIntegerVT(VT.getSizeInBits()));
06379       DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
06380     }
06381     else 
06382       DstVec = DAG.getUNDEF(VT);
06383     return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
06384                        Op.getOperand(NonConstIdx),
06385                        DAG.getIntPtrConstant(NonConstIdx));
06386   }
06387   if (!IsSplat && (NonConstIdx != 0))
06388     llvm_unreachable("Unsupported BUILD_VECTOR operation");
06389   MVT SelectVT = (VT == MVT::v16i1) ? MVT::i16 : MVT::i8;
06390   SDValue Select;
06391   if (IsSplat)
06392     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
06393                           DAG.getConstant(-1, SelectVT),
06394                           DAG.getConstant(0, SelectVT));
06395   else
06396     Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0),
06397                          DAG.getConstant((Immediate | 1), SelectVT),
06398                          DAG.getConstant(Immediate, SelectVT));
06399   return DAG.getNode(ISD::BITCAST, dl, VT, Select);
06400 }
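
// Illustrative example of the all-constant case above (a sketch): the v8i1
// build_vector <1,0,1,1,0,0,0,1> sets bits 0, 2, 3 and 7, so Immediate is
// 0x8D and the whole mask is materialized as
//   (extract_subvector (bitcast (i16 0x8D) to v16i1), 0)
// i.e. one scalar constant instead of eight element insertions.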
06401 
06402 /// \brief Return true if \p N implements a horizontal binop, and return the
06403 /// operands of the horizontal binop in V0 and V1.
06404 ///
06405 /// This is a helper function of PerformBUILD_VECTORCombine.
06406 /// This function checks that the input build_vector \p N implements a
06407 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
06408 /// operation to match.
06409 /// For example, if \p Opcode is equal to ISD::ADD, then this function
06410 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
06411 /// is equal to ISD::SUB, then this function checks if this is a horizontal
06412 /// arithmetic sub.
06413 ///
06414 /// This function only analyzes elements of \p N whose indices are
06415 /// in range [BaseIdx, LastIdx).
06416 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
06417                               SelectionDAG &DAG,
06418                               unsigned BaseIdx, unsigned LastIdx,
06419                               SDValue &V0, SDValue &V1) {
06420   EVT VT = N->getValueType(0);
06421 
06422   assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
06423   assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
06424          "Invalid Vector in input!");
06425   
06426   bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
06427   bool CanFold = true;
06428   unsigned ExpectedVExtractIdx = BaseIdx;
06429   unsigned NumElts = LastIdx - BaseIdx;
06430   V0 = DAG.getUNDEF(VT);
06431   V1 = DAG.getUNDEF(VT);
06432 
06433   // Check if N implements a horizontal binop.
06434   for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
06435     SDValue Op = N->getOperand(i + BaseIdx);
06436 
06437     // Skip UNDEFs.
06438     if (Op->getOpcode() == ISD::UNDEF) {
06439       // Update the expected vector extract index.
06440       if (i * 2 == NumElts)
06441         ExpectedVExtractIdx = BaseIdx;
06442       ExpectedVExtractIdx += 2;
06443       continue;
06444     }
06445 
06446     CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
06447 
06448     if (!CanFold)
06449       break;
06450 
06451     SDValue Op0 = Op.getOperand(0);
06452     SDValue Op1 = Op.getOperand(1);
06453 
06454     // Try to match the following pattern:
06455     // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
06456     CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
06457         Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
06458         Op0.getOperand(0) == Op1.getOperand(0) &&
06459         isa<ConstantSDNode>(Op0.getOperand(1)) &&
06460         isa<ConstantSDNode>(Op1.getOperand(1)));
06461     if (!CanFold)
06462       break;
06463 
06464     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
06465     unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
06466 
06467     if (i * 2 < NumElts) {
06468       if (V0.getOpcode() == ISD::UNDEF)
06469         V0 = Op0.getOperand(0);
06470     } else {
06471       if (V1.getOpcode() == ISD::UNDEF)
06472         V1 = Op0.getOperand(0);
06473       if (i * 2 == NumElts)
06474         ExpectedVExtractIdx = BaseIdx;
06475     }
06476 
06477     SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
06478     if (I0 == ExpectedVExtractIdx)
06479       CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
06480     else if (IsCommutable && I1 == ExpectedVExtractIdx) {
06481       // Try to match the following dag sequence:
06482       // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
06483       CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
06484     } else
06485       CanFold = false;
06486 
06487     ExpectedVExtractIdx += 2;
06488   }
06489 
06490   return CanFold;
06491 }
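
// Illustrative example of the pattern recognized above (a sketch, with
// hypothetical v4f32 inputs %A and %B):
//   (build_vector (fadd (extract_vector_elt %A, 0), (extract_vector_elt %A, 1)),
//                 (fadd (extract_vector_elt %A, 2), (extract_vector_elt %A, 3)),
//                 (fadd (extract_vector_elt %B, 0), (extract_vector_elt %B, 1)),
//                 (fadd (extract_vector_elt %B, 2), (extract_vector_elt %B, 3)))
// is matched with V0 = %A and V1 = %B, which the caller can then turn into a
// single X86ISD::FHADD (i.e. a haddps).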
06492 
06493 /// \brief Emit a sequence of two 128-bit horizontal add/sub operations followed
06494 /// by a concat_vectors node.
06495 ///
06496 /// This is a helper function of PerformBUILD_VECTORCombine.
06497 /// This function expects two 256-bit vectors called V0 and V1.
06498 /// At first, each vector is split into two separate 128-bit vectors.
06499 /// Then, the resulting 128-bit vectors are used to implement two
06500 /// horizontal binary operations. 
06501 ///
06502 /// The kind of horizontal binary operation is defined by \p X86Opcode.
06503 ///
06504 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed as input to
06505 /// the two new horizontal binops.
06506 /// When Mode is set, the first horizontal binop dag node takes as input the
06507 /// lower 128 bits of V0 and the upper 128 bits of V0. The second horizontal
06508 /// binop dag node takes as input the lower 128 bits of V1 and the upper
06509 /// 128 bits of V1.
06510 ///   Example:
06511 ///     HADD V0_LO, V0_HI
06512 ///     HADD V1_LO, V1_HI
06513 ///
06514 /// Otherwise, the first horizontal binop dag node takes as input the lower
06515 /// 128 bits of V0 and the lower 128 bits of V1, and the second horizontal binop
06516 /// dag node takes the upper 128 bits of V0 and the upper 128 bits of V1.
06517 ///   Example:
06518 ///     HADD V0_LO, V1_LO
06519 ///     HADD V0_HI, V1_HI
06520 ///
06521 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
06522 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
06523 /// the upper 128-bits of the result.
06524 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
06525                                      SDLoc DL, SelectionDAG &DAG,
06526                                      unsigned X86Opcode, bool Mode,
06527                                      bool isUndefLO, bool isUndefHI) {
06528   EVT VT = V0.getValueType();
06529   assert(VT.is256BitVector() && VT == V1.getValueType() &&
06530          "Invalid nodes in input!");
06531 
06532   unsigned NumElts = VT.getVectorNumElements();
06533   SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL);
06534   SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL);
06535   SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL);
06536   SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL);
06537   EVT NewVT = V0_LO.getValueType();
06538 
06539   SDValue LO = DAG.getUNDEF(NewVT);
06540   SDValue HI = DAG.getUNDEF(NewVT);
06541 
06542   if (Mode) {
06543     // Don't emit a horizontal binop if the result is expected to be UNDEF.
06544     if (!isUndefLO && V0->getOpcode() != ISD::UNDEF)
06545       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
06546     if (!isUndefHI && V1->getOpcode() != ISD::UNDEF)
06547       HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
06548   } else {
06549     // Don't emit a horizontal binop if the result is expected to be UNDEF.
06550     if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF ||
06551                        V1_LO->getOpcode() != ISD::UNDEF))
06552       LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
06553 
06554     if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF ||
06555                        V1_HI->getOpcode() != ISD::UNDEF))
06556       HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
06557   }
06558 
06559   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
06560 }
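
// Illustrative example of the expansion above (a sketch): for v8i32 inputs V0
// and V1 with Mode == false and X86Opcode == X86ISD::HADD, the result is
//   (concat_vectors (HADD V0_LO, V1_LO), (HADD V0_HI, V1_HI))
// i.e. two 128-bit phaddd nodes glued back into a 256-bit vector.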
06561 
06562 /// \brief Try to fold a build_vector that performs an 'addsub' into a single
06563 /// X86ISD::ADDSUB node.
06564 static SDValue matchAddSub(const BuildVectorSDNode *BV, SelectionDAG &DAG,
06565                            const X86Subtarget *Subtarget) {
06566   SDLoc DL(BV);
06567   EVT VT = BV->getValueType(0);
06568   unsigned NumElts = VT.getVectorNumElements();
06569   SDValue InVec0 = DAG.getUNDEF(VT);
06570   SDValue InVec1 = DAG.getUNDEF(VT);
06571 
06572   assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
06573           VT == MVT::v2f64) && "build_vector with an invalid type found!");
06574 
06575   // Odd-numbered elements in the input build vector are obtained from
06576   // adding two float elements.
06577   // Even-numbered elements in the input build vector are obtained from
06578   // subtracting two float elements.
06579   unsigned ExpectedOpcode = ISD::FSUB;
06580   unsigned NextExpectedOpcode = ISD::FADD;
06581   bool AddFound = false;
06582   bool SubFound = false;
06583 
06584   for (unsigned i = 0, e = NumElts; i != e; i++) {
06585     SDValue Op = BV->getOperand(i);
06586 
06587     // Skip 'undef' values.
06588     unsigned Opcode = Op.getOpcode();
06589     if (Opcode == ISD::UNDEF) {
06590       std::swap(ExpectedOpcode, NextExpectedOpcode);
06591       continue;
06592     }
06593 
06594     // Early exit if we found an unexpected opcode.
06595     if (Opcode != ExpectedOpcode)
06596       return SDValue();
06597 
06598     SDValue Op0 = Op.getOperand(0);
06599     SDValue Op1 = Op.getOperand(1);
06600 
06601     // Try to match the following pattern:
06602     // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
06603     // Early exit if we cannot match that sequence.
06604     if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
06605         Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
06606         !isa<ConstantSDNode>(Op0.getOperand(1)) ||
06607         !isa<ConstantSDNode>(Op1.getOperand(1)) ||
06608         Op0.getOperand(1) != Op1.getOperand(1))
06609       return SDValue();
06610 
06611     unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
06612     if (I0 != i)
06613       return SDValue();
06614 
06615     // We found a valid add/sub node. Update the information accordingly.
06616     if (i & 1)
06617       AddFound = true;
06618     else
06619       SubFound = true;
06620 
06621     // Update InVec0 and InVec1.
06622     if (InVec0.getOpcode() == ISD::UNDEF)
06623       InVec0 = Op0.getOperand(0);
06624     if (InVec1.getOpcode() == ISD::UNDEF)
06625       InVec1 = Op1.getOperand(0);
06626 
06627     // Make sure that the operands of each add/sub node always come
06628     // from the same pair of vectors.
06629     if (InVec0 != Op0.getOperand(0)) {
06630       if (ExpectedOpcode == ISD::FSUB)
06631         return SDValue();
06632 
06633       // FADD is commutable. Try to commute the operands
06634       // and then test again.
06635       std::swap(Op0, Op1);
06636       if (InVec0 != Op0.getOperand(0))
06637         return SDValue();
06638     }
06639 
06640     if (InVec1 != Op1.getOperand(0))
06641       return SDValue();
06642 
06643     // Update the pair of expected opcodes.
06644     std::swap(ExpectedOpcode, NextExpectedOpcode);
06645   }
06646 
06647   // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
06648   if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
06649       InVec1.getOpcode() != ISD::UNDEF)
06650     return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
06651 
06652   return SDValue();
06653 }
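
// Illustrative example of the 'addsub' pattern above (a sketch, with
// hypothetical v4f32 inputs %A and %B):
//   (build_vector (fsub (extract_vector_elt %A, 0), (extract_vector_elt %B, 0)),
//                 (fadd (extract_vector_elt %A, 1), (extract_vector_elt %B, 1)),
//                 (fsub (extract_vector_elt %A, 2), (extract_vector_elt %B, 2)),
//                 (fadd (extract_vector_elt %A, 3), (extract_vector_elt %B, 3)))
// becomes (X86ISD::ADDSUB %A, %B), which selects to addsubps: subtract in the
// even lanes, add in the odd lanes.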
06654 
06655 static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
06656                                           const X86Subtarget *Subtarget) {
06657   SDLoc DL(N);
06658   EVT VT = N->getValueType(0);
06659   unsigned NumElts = VT.getVectorNumElements();
06660   BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
06661   SDValue InVec0, InVec1;
06662 
06663   // Try to match an ADDSUB.
06664   if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
06665       (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
06666     SDValue Value = matchAddSub(BV, DAG, Subtarget);
06667     if (Value.getNode())
06668       return Value;
06669   }
06670 
06671   // Try to match horizontal ADD/SUB.
06672   unsigned NumUndefsLO = 0;
06673   unsigned NumUndefsHI = 0;
06674   unsigned Half = NumElts/2;
06675 
06676   // Count the number of UNDEF operands in the input build_vector.
06677   for (unsigned i = 0, e = Half; i != e; ++i)
06678     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
06679       NumUndefsLO++;
06680 
06681   for (unsigned i = Half, e = NumElts; i != e; ++i)
06682     if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
06683       NumUndefsHI++;
06684 
06685   // Early exit if this is either a build_vector of all UNDEFs or all the
06686   // operands but one are UNDEF.
06687   if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
06688     return SDValue();
06689 
06690   if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
06691     // Try to match an SSE3 float HADD/HSUB.
06692     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
06693       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
06694     
06695     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
06696       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
06697   } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
06698     // Try to match an SSSE3 integer HADD/HSUB.
06699     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
06700       return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
06701     
06702     if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
06703       return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
06704   }
06705   
06706   if (!Subtarget->hasAVX())
06707     return SDValue();
06708 
06709   if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
06710     // Try to match an AVX horizontal add/sub of packed single/double
06711     // precision floating point values from 256-bit vectors.
06712     SDValue InVec2, InVec3;
06713     if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
06714         isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
06715         ((InVec0.getOpcode() == ISD::UNDEF ||
06716           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06717         ((InVec1.getOpcode() == ISD::UNDEF ||
06718           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06719       return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
06720 
06721     if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
06722         isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
06723         ((InVec0.getOpcode() == ISD::UNDEF ||
06724           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06725         ((InVec1.getOpcode() == ISD::UNDEF ||
06726           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06727       return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
06728   } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
06729     // Try to match an AVX2 horizontal add/sub of signed integers.
06730     SDValue InVec2, InVec3;
06731     unsigned X86Opcode;
06732     bool CanFold = true;
06733 
06734     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
06735         isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
06736         ((InVec0.getOpcode() == ISD::UNDEF ||
06737           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06738         ((InVec1.getOpcode() == ISD::UNDEF ||
06739           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06740       X86Opcode = X86ISD::HADD;
06741     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
06742         isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
06743         ((InVec0.getOpcode() == ISD::UNDEF ||
06744           InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
06745         ((InVec1.getOpcode() == ISD::UNDEF ||
06746           InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
06747       X86Opcode = X86ISD::HSUB;
06748     else
06749       CanFold = false;
06750 
06751     if (CanFold) {
06752       // Fold this build_vector into a single horizontal add/sub.
06753       // Do this only if the target has AVX2.
06754       if (Subtarget->hasAVX2())
06755         return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
06756  
06757       // Do not try to expand this build_vector into a pair of horizontal
06758       // add/sub if we can emit a pair of scalar add/sub.
06759       if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
06760         return SDValue();
06761 
06762       // Convert this build_vector into a pair of horizontal binop followed by
06763       // a concat vector.
06764       bool isUndefLO = NumUndefsLO == Half;
06765       bool isUndefHI = NumUndefsHI == Half;
06766       return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
06767                                    isUndefLO, isUndefHI);
06768     }
06769   }
06770 
06771   if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
06772        VT == MVT::v16i16) && Subtarget->hasAVX()) {
06773     unsigned X86Opcode;
06774     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
06775       X86Opcode = X86ISD::HADD;
06776     else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
06777       X86Opcode = X86ISD::HSUB;
06778     else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
06779       X86Opcode = X86ISD::FHADD;
06780     else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
06781       X86Opcode = X86ISD::FHSUB;
06782     else
06783       return SDValue();
06784 
06785     // Don't try to expand this build_vector into a pair of horizontal add/sub
06786     // if we can simply emit a pair of scalar add/sub.
06787     if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
06788       return SDValue();
06789 
06790     // Convert this build_vector into two horizontal add/sub followed by
06791     // a concat vector.
06792     bool isUndefLO = NumUndefsLO == Half;
06793     bool isUndefHI = NumUndefsHI == Half;
06794     return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
06795                                  isUndefLO, isUndefHI);
06796   }
06797 
06798   return SDValue();
06799 }
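
// Illustrative example of the combine above (a sketch, with hypothetical v8i32
// inputs %A and %B): a v8i32 build_vector whose elements are pairwise sums of
// %A and %B in the lane order produced by vphaddd is folded into a single
// (X86ISD::HADD %A, %B) on AVX2; with AVX but no AVX2, the same pattern is
// instead expanded by ExpandHorizontalBinOp into two 128-bit HADD nodes plus
// a concat_vectors.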
06800 
06801 SDValue
06802 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
06803   SDLoc dl(Op);
06804 
06805   MVT VT = Op.getSimpleValueType();
06806   MVT ExtVT = VT.getVectorElementType();
06807   unsigned NumElems = Op.getNumOperands();
06808 
06809   // Generate vectors for predicate vectors.
06810   if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
06811     return LowerBUILD_VECTORvXi1(Op, DAG);
06812 
06813   // Vectors containing all zeros can be matched by pxor and xorps later
06814   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
06815     // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
06816     // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
06817     if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
06818       return Op;
06819 
06820     return getZeroVector(VT, Subtarget, DAG, dl);
06821   }
06822 
06823   // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
06824   // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
06825   // vpcmpeqd on 256-bit vectors.
06826   if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
06827     if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
06828       return Op;
06829 
06830     if (!VT.is512BitVector())
06831       return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
06832   }
06833 
06834   SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
06835   if (Broadcast.getNode())
06836     return Broadcast;
06837 
06838   unsigned EVTBits = ExtVT.getSizeInBits();
06839 
06840   unsigned NumZero  = 0;
06841   unsigned NumNonZero = 0;
06842   unsigned NonZeros = 0;
06843   bool IsAllConstants = true;
06844   SmallSet<SDValue, 8> Values;
06845   for (unsigned i = 0; i < NumElems; ++i) {
06846     SDValue Elt = Op.getOperand(i);
06847     if (Elt.getOpcode() == ISD::UNDEF)
06848       continue;
06849     Values.insert(Elt);
06850     if (Elt.getOpcode() != ISD::Constant &&
06851         Elt.getOpcode() != ISD::ConstantFP)
06852       IsAllConstants = false;
06853     if (X86::isZeroNode(Elt))
06854       NumZero++;
06855     else {
06856       NonZeros |= (1 << i);
06857       NumNonZero++;
06858     }
06859   }
06860 
06861   // All undef vector. Return an UNDEF.  All zero vectors were handled above.
06862   if (NumNonZero == 0)
06863     return DAG.getUNDEF(VT);
06864 
06865   // Special case for single non-zero, non-undef, element.
06866   if (NumNonZero == 1) {
06867     unsigned Idx = countTrailingZeros(NonZeros);
06868     SDValue Item = Op.getOperand(Idx);
06869 
06870     // If this is an insertion of an i64 value on x86-32, and if the top bits of
06871     // the value are obviously zero, truncate the value to i32 and do the
06872     // insertion that way.  Only do this if the value is non-constant or if the
06873     // value is a constant being inserted into element 0.  It is cheaper to do
06874     // a constant pool load than it is to do a movd + shuffle.
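    // For example (an illustrative sketch): on x86-32, a v2i64 whose only
    // non-zero element is an i64 with its top 32 bits known to be zero is
    // built by truncating the value to i32, forming a scalar_to_vector, and
    // zero-extending it into the vector with a shuffle against zero, which
    // matches a single movd (plus a swizzle if the element is not in lane 0)
    // instead of a 64-bit constant-pool load.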
06875     if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
06876         (!IsAllConstants || Idx == 0)) {
06877       if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
06878         // Handle SSE only.
06879         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
06880         EVT VecVT = MVT::v4i32;
06881         unsigned VecElts = 4;
06882 
06883         // Truncate the value (which may itself be a constant) to i32, and
06884         // convert it to a vector with movd (S2V+shuffle to zero extend).
06885         Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
06886         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
06887 
06888         // If using the new shuffle lowering, just directly insert this.
06889         if (ExperimentalVectorShuffleLowering)
06890           return DAG.getNode(
06891               ISD::BITCAST, dl, VT,
06892               getShuffleVectorZeroOrUndef(Item, Idx * 2, true, Subtarget, DAG));
06893 
06894         Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06895 
06896         // Now we have our 32-bit value zero extended in the low element of
06897         // a vector.  If Idx != 0, swizzle it into place.
06898         if (Idx != 0) {
06899           SmallVector<int, 4> Mask;
06900           Mask.push_back(Idx);
06901           for (unsigned i = 1; i != VecElts; ++i)
06902             Mask.push_back(i);
06903           Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
06904                                       &Mask[0]);
06905         }
06906         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
06907       }
06908     }
06909 
06910     // If we have a constant or non-constant insertion into the low element of
06911     // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
06912     // the rest of the elements.  This will be matched as movd/movq/movss/movsd
06913     // depending on what the source datatype is.
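    // For example (an illustrative sketch, with a hypothetical float %x): a
    // v4f32 build_vector with %x in lane 0 and zeros elsewhere becomes
    // (scalar_to_vector %x) shuffled against an all-zero vector, which is
    // matched as a single movss; the i32/i64/f64 cases match movd/movq/movsd.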
06914     if (Idx == 0) {
06915       if (NumZero == 0)
06916         return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06917 
06918       if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
06919           (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
06920         if (VT.is256BitVector() || VT.is512BitVector()) {
06921           SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
06922           return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
06923                              Item, DAG.getIntPtrConstant(0));
06924         }
06925         assert(VT.is128BitVector() && "Expected an SSE value type!");
06926         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06927         // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
06928         return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06929       }
06930 
06931       if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
06932         Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
06933         Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
06934         if (VT.is256BitVector()) {
06935           SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
06936           Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
06937         } else {
06938           assert(VT.is128BitVector() && "Expected an SSE value type!");
06939           Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
06940         }
06941         return DAG.getNode(ISD::BITCAST, dl, VT, Item);
06942       }
06943     }
06944 
06945     // Is it a vector logical left shift?
06946     if (NumElems == 2 && Idx == 1 &&
06947         X86::isZeroNode(Op.getOperand(0)) &&
06948         !X86::isZeroNode(Op.getOperand(1))) {
06949       unsigned NumBits = VT.getSizeInBits();
06950       return getVShift(true, VT,
06951                        DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
06952                                    VT, Op.getOperand(1)),
06953                        NumBits/2, DAG, *this, dl);
06954     }
06955 
06956     if (IsAllConstants) // Otherwise, it's better to do a constpool load.
06957       return SDValue();
06958 
06959     // Otherwise, if this is a vector with i32 or f32 elements, and the element
06960     // is a non-constant being inserted into an element other than the low one,
06961     // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
06962     // movd/movss) to move this into the low element, then shuffle it into
06963     // place.
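    // For example (an illustrative sketch, with a hypothetical i32 %x): a
    // v4i32 build_vector whose only non-undef element is %x at index 2 is
    // lowered as a movd of %x into lane 0, followed by a shuffle that moves
    // lane 0 into lane 2 (with the new shuffle lowering, the zero/undef
    // shuffle is emitted directly at index Idx instead).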
06964     if (EVTBits == 32) {
06965       Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
06966 
06967       // If using the new shuffle lowering, just directly insert this.
06968       if (ExperimentalVectorShuffleLowering)
06969         return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
06970 
06971       // Turn it into a shuffle of zero and zero-extended scalar to vector.
06972       Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
06973       SmallVector<int, 8> MaskVec;
06974       for (unsigned i = 0; i != NumElems; ++i)
06975         MaskVec.push_back(i == Idx ? 0 : 1);
06976       return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
06977     }
06978   }
06979 
06980   // Splat is obviously ok. Let legalizer expand it to a shuffle.
06981   if (Values.size() == 1) {
06982     if (EVTBits == 32) {
06983       // Instead of a shuffle like this:
06984       //   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
06985       // check if it's possible to issue this instead:
06986       //   shuffle (vload ptr), undef, <1, 1, 1, 1>
06987       unsigned Idx = countTrailingZeros(NonZeros);
06988       SDValue Item = Op.getOperand(Idx);
06989       if (Op.getNode()->isOnlyUserOf(Item.getNode()))
06990         return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
06991     }
06992     return SDValue();
06993   }
06994 
06995   // A vector full of immediates; various special cases are already
06996   // handled, so this is best done with a single constant-pool load.
06997   if (IsAllConstants)
06998     return SDValue();
06999 
07000   // For AVX-length vectors, build the individual 128-bit pieces and use
07001   // shuffles to put them in place.
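  // For example (an illustrative sketch): a v8f32 build_vector of eight
  // distinct scalars is split into two v4f32 build_vectors for the lower and
  // upper halves, which are then recombined with Concat128BitVectors, i.e. a
  // vinsertf128.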
07002   if (VT.is256BitVector() || VT.is512BitVector()) {
07003     SmallVector<SDValue, 64> V;
07004     for (unsigned i = 0; i != NumElems; ++i)
07005       V.push_back(Op.getOperand(i));
07006 
07007     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
07008 
07009     // Build both the lower and upper subvector.
07010     SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
07011                                 makeArrayRef(&V[0], NumElems/2));
07012     SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
07013                                 makeArrayRef(&V[NumElems / 2], NumElems/2));
07014 
07015     // Recreate the wider vector with the lower and upper part.
07016     if (VT.is256BitVector())
07017       return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
07018     return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
07019   }
07020 
07021   // Let legalizer expand 2-wide build_vectors.
07022   if (EVTBits == 64) {
07023     if (NumNonZero == 1) {
07024       // One half is zero or undef.
07025       unsigned Idx = countTrailingZeros(NonZeros);
07026       SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
07027                                  Op.getOperand(Idx));
07028       return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
07029     }
07030     return SDValue();
07031   }
07032 
07033   // If element VT is < 32 bits, convert it to inserts into a zero vector.
07034   if (EVTBits == 8 && NumElems == 16) {
07035     SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero, DAG,
07036                                         Subtarget, *this);
07037     if (V.getNode()) return V;
07038   }
07039 
07040   if (EVTBits == 16 && NumElems == 8) {
07041     SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero, DAG,
07042                                       Subtarget, *this);
07043     if (V.getNode()) return V;
07044   }
07045 
07046   // If the element VT is 32 bits and there are 4 elements, try an INSERTPS.
07047   if (EVTBits == 32 && NumElems == 4) {
07048     SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
07049     if (V.getNode())
07050       return V;
07051   }
07052 
07053   // If element VT is == 32 bits, turn it into a number of shuffles.
07054   SmallVector<SDValue, 8> V(NumElems);
07055   if (NumElems == 4 && NumZero > 0) {
07056     for (unsigned i = 0; i < 4; ++i) {
07057       bool isZero = !(NonZeros & (1 << i));
07058       if (isZero)
07059         V[i] = getZeroVector(VT, Subtarget, DAG, dl);
07060       else
07061         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
07062     }
07063 
07064     for (unsigned i = 0; i < 2; ++i) {
07065       switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
07066         default: break;
07067         case 0:
07068           V[i] = V[i*2];  // Must be a zero vector.
07069           break;
07070         case 1:
07071           V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
07072           break;
07073         case 2:
07074           V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
07075           break;
07076         case 3:
07077           V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
07078           break;
07079       }
07080     }
07081 
07082     bool Reverse1 = (NonZeros & 0x3) == 2;
07083     bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
07084     int MaskVec[] = {
07085       Reverse1 ? 1 : 0,
07086       Reverse1 ? 0 : 1,
07087       static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
07088       static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
07089     };
07090     return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
07091   }
07092 
07093   if (Values.size() > 1 && VT.is128BitVector()) {
07094     // Check for a build vector of consecutive loads.
07095     for (unsigned i = 0; i < NumElems; ++i)
07096       V[i] = Op.getOperand(i);
07097 
07098     // Check for elements which are consecutive loads.
07099     SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
07100     if (LD.getNode())
07101       return LD;
07102 
07103     // Check for a build vector that is mostly a shuffle plus a few insertions.
07104     SDValue Sh = buildFromShuffleMostly(Op, DAG);
07105     if (Sh.getNode())
07106       return Sh;
07107 
07108     // For SSE 4.1, use insertps to insert each of the remaining elements.
07109     if (getSubtarget()->hasSSE41()) {
07110       SDValue Result;
07111       if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
07112         Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
07113       else
07114         Result = DAG.getUNDEF(VT);
07115 
07116       for (unsigned i = 1; i < NumElems; ++i) {
07117         if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
07118         Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
07119                              Op.getOperand(i), DAG.getIntPtrConstant(i));
07120       }
07121       return Result;
07122     }
07123 
07124     // Otherwise, expand into a number of unpckl*, start by extending each of
07125     // our (non-undef) elements to the full vector width with the element in the
07126     // bottom slot of the vector (which generates no code for SSE).
07127     for (unsigned i = 0; i < NumElems; ++i) {
07128       if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
07129         V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
07130       else